Function to clean student drop out data

# Import the data 
student_data <- read_csv(file = 'predict_students_dropout_and_academic_success_final.csv')
## Rows: 4424 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Target
## dbl (36): Marital_status, Application_mode, Application_order, Course, Dayti...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Remove "Enrolled" status 
student_data_filtered <- student_data[student_data$Target != "Enrolled", ]

student_data_filtered <- student_data_filtered %>%
  mutate(Marital_status = case_when(
    Marital_status == 1 ~ "single",
    Marital_status == 2 ~ "married",
    Marital_status == 3 ~ "widower",
    Marital_status == 4 ~ "divorced",
    Marital_status == 5 ~ "facto union",
    Marital_status == 6 ~ "legally separated",
    TRUE ~ as.character(Marital_status)
  ))

student_data_filtered <- student_data_filtered %>%
  mutate(Application_mode = case_when(
    Application_mode == 1 ~ "1st phase general contingent",
    Application_mode == 2 ~ "Ordinance No. 612/93",
    Application_mode == 5 ~ "1st phase special contingent (Azores Island)",
    Application_mode == 7 ~ "Holders of other higher courses",
    Application_mode == 10 ~ "Ordinance No. 854-B/99",
    Application_mode == 15 ~ "International student (bachelor)",
    Application_mode == 16 ~ "1st phase special contingent (Madeira Island)",
    Application_mode == 17 ~ "2nd phase general contingent",
    Application_mode == 18 ~ "3rd phase general contingent",
    Application_mode == 26 ~ "Ordinance No. 533 A 99, item b2) (Different Plan)",
    Application_mode == 27 ~ "Ordinance No. 533 A 99, item b3 (Other Institution)",
    Application_mode == 39 ~ "Over 23 years old",
    Application_mode == 42 ~ "Transfer",
    Application_mode == 43 ~ "Change of course",
    Application_mode == 44 ~ "Technological specialization diploma holders",
    Application_mode == 51 ~ "Change of institution/course",
    Application_mode == 53 ~ "Short cycle diploma holders",
    Application_mode == 57 ~ "Change of institution/course (International)",
    TRUE ~ as.character(Application_mode)
  ))

student_data_filtered <- student_data_filtered %>%
  mutate(Course = case_when(
    Course == 33 ~ "Biofuel Production Technologies",
    Course == 171 ~ "Animation and Multimedia Design",
    Course == 8014 ~ "Social Service (evening attendance)",
    Course == 9003 ~ "Agronomy",
    Course == 9070 ~ "Communication Design",
    Course == 9085 ~ "Veterinary Nursing",
    Course == 9119 ~ "Informatics Engineering",
    Course == 9130 ~ "Equinculture",
    Course == 9147 ~ "Management",
    Course == 9238 ~ "Social Service",
    Course == 9254 ~ "Tourism",
    Course == 9500 ~ "Nursing",
    Course == 9556 ~ "Oral Hygiene",
    Course == 9670 ~ "Advertising and Marketing Management",
    Course == 9773 ~ "Journalism and Communication",
    Course == 9853 ~ "Basic Education",
    Course == 9991 ~ "Management (evening attendance)",
    TRUE ~ as.character(Course)
  ))

student_data_filtered <- student_data_filtered %>%
  mutate(Daytime_evening_attendance = case_when(
    Daytime_evening_attendance == 1 ~ "daytime",
    Daytime_evening_attendance == 0 ~ "evening",
    TRUE ~ as.character(Daytime_evening_attendance)
  ))


student_data_filtered <- student_data_filtered %>%
  mutate(Previous_qualification = case_when(
    Previous_qualification == 1 ~ "Secondary education",
    Previous_qualification == 2 ~ "Higher education bachelors degree",
    Previous_qualification == 3 ~ "Higher education degree",
    Previous_qualification == 4 ~ "Higher education masters",
    Previous_qualification == 5 ~ "Higher education doctorate",
    Previous_qualification == 6 ~ "Frequency of higher education",
    Previous_qualification == 9 ~ "12th year of schooling not completed",
    Previous_qualification == 10 ~ "11th year of schooling not completed",
    Previous_qualification == 12 ~ "Other 11th year of schooling",
    Previous_qualification == 14 ~ "10th year of schooling",
    Previous_qualification == 15 ~ "10th year of schooling not completed",
    Previous_qualification == 19 ~ "Basic education 3rd cycle (9th 10th 11th year) or equiv.",
    Previous_qualification == 38 ~ "Basic education 2nd cycle (6th 7th 8th year) or equiv.",
    Previous_qualification == 39 ~ "Technological specialization course",
    Previous_qualification == 40 ~ "Higher education degree (1st cycle)",
    Previous_qualification == 42 ~ "Professional higher technical course",
    Previous_qualification == 43 ~ "Higher education master (2nd cycle)",
    TRUE ~ as.character(Previous_qualification)
  ))

student_data_filtered <- student_data_filtered %>%
  mutate(Nationality = case_when(
    Nationality == 1 ~ "Portuguese",
    Nationality == 2 ~ "German",
    Nationality == 6 ~ "Spanish",
    Nationality == 11 ~ "Italian",
    Nationality == 13 ~ "Dutch",
    Nationality == 14 ~ "English",
    Nationality == 17 ~ "Lithuanian",
    Nationality == 21 ~ "Angolan",
    Nationality == 22 ~ "Cape Verdean",
    Nationality == 24 ~ "Guinean",
    Nationality == 25 ~ "Mozambican",
    Nationality == 26 ~ "Santomean",
    Nationality == 32 ~ "Turkish",
    Nationality == 41 ~ "Brazilian",
    Nationality == 62 ~ "Romanian",
    Nationality == 100 ~ "Moldova (Republic of)",
    Nationality == 101 ~ "Mexican",
    Nationality == 103 ~ "Ukrainian",
    Nationality == 105 ~ "Russian",
    Nationality == 108 ~ "Cuban",
    Nationality == 109 ~ "Colombian",
    TRUE ~ as.character(Nationality)
  ))


student_data_filtered <- student_data_filtered %>%
  mutate(Mothers_qualification = case_when(
    Mothers_qualification == 1 ~ "Secondary Education 12th Year of Schooling or Eq.",
    Mothers_qualification == 2 ~ "Higher Education Bachelors Degree",
    Mothers_qualification == 3 ~ "Higher Education Degree",
    Mothers_qualification == 4 ~ "Higher Education Masters",
    Mothers_qualification == 5 ~ "Higher Education Doctorate",
    Mothers_qualification == 6 ~ "Frequency of Higher Education",
    Mothers_qualification == 9 ~ "12th Year of Schooling Not Completed",
    Mothers_qualification == 10 ~ "11th Year of Schooling Not Completed",
    Mothers_qualification == 11 ~ "7th Year (Old)",
    Mothers_qualification == 12 ~ "Other 11th Year of Schooling",
    Mothers_qualification == 14 ~ "10th Year of Schooling",
    Mothers_qualification == 18 ~ "General Commerce Course",
    Mothers_qualification == 19 ~ "Basic Education 3rd Cycle (9th 10th 11th Year) or Equiv.",
    Mothers_qualification == 22 ~ "Technical-Professional Course",
    Mothers_qualification == 26 ~ "7th Year of Schooling",
    Mothers_qualification == 27 ~ "2nd Cycle of the General High School Course",
    Mothers_qualification == 29 ~ "9th Year of Schooling Not Completed",
    Mothers_qualification == 30 ~ "8th Year of Schooling",
    Mothers_qualification == 34 ~ "Unknown",
    Mothers_qualification == 35 ~ "Cant Read or Write",
    Mothers_qualification == 36 ~ "Can Read Without Having a 4th Year of Schooling",
    Mothers_qualification == 37 ~ "Basic Education 1st Cycle (4th 5th Year) or Equiv.",
    Mothers_qualification == 38 ~ "Basic Education 2nd Cycle (6th 7th 8th Year) or Equiv.",
    Mothers_qualification == 39 ~ "Technological Specialization Course",
    Mothers_qualification == 40 ~ "Higher Education Degree (1st Cycle)",
    Mothers_qualification == 41 ~ "Specialized Higher Studies Course",
    Mothers_qualification == 42 ~ "Professional Higher Technical Course",
    Mothers_qualification == 43 ~ "Higher Education Master (2nd Cycle)",
    Mothers_qualification == 44 ~ "Higher Education Doctorate (3rd Cycle)",
    TRUE ~ as.character(Mothers_qualification)
  ))

student_data_filtered <- student_data_filtered %>%
  mutate(Fathers_qualification = case_when(
    Fathers_qualification == 1 ~ "Secondary Education 12th Year of Schooling or Eq",
    Fathers_qualification == 2 ~ "Higher Education Bachelors Degree",
    Fathers_qualification == 3 ~ "Higher Education Degree",
    Fathers_qualification == 4 ~ "Higher Education Masters",
    Fathers_qualification == 5 ~ "Higher Education Doctorate",
    Fathers_qualification == 6 ~ "Frequency of Higher Education",
    Fathers_qualification == 9 ~ "12th Year of Schooling Not Completed",
    Fathers_qualification == 10 ~ "11th Year of Schooling Not Completed",
    Fathers_qualification == 11 ~ "7th Year (Old)",
    Fathers_qualification == 12 ~ "Other 11th Year of Schooling",
    Fathers_qualification == 13 ~ "2nd Year Complementary High School Course",
    Fathers_qualification == 14 ~ "10th Year of Schooling",
    Fathers_qualification == 18 ~ "General Commerce Course",
    Fathers_qualification == 19 ~ "Basic Education 3rd Cycle (9th 10th 11th Year) or Equiv.",
    Fathers_qualification == 20 ~ "Complementary High School Course",
    Fathers_qualification == 22 ~ "Technical Professional Course",
    Fathers_qualification == 25 ~ "Complementary High School Course Not Concluded",
    Fathers_qualification == 26 ~ "7th Year of Schooling",
    Fathers_qualification == 27 ~ "2nd Cycle of the General High School Course",
    Fathers_qualification == 29 ~ "9th Year of Schooling Not Completed",
    Fathers_qualification == 30 ~ "8th Year of Schooling",
    Fathers_qualification == 31 ~ "General Course of Administration and Commerce",
    Fathers_qualification == 33 ~ "Supplementary Accounting and Administration",
    Fathers_qualification == 34 ~ "Unknown",
    Fathers_qualification == 35 ~ "Cant Read or Write",
    Fathers_qualification == 36 ~ "Can Read Without Having a 4th Year of Schooling",
    Fathers_qualification == 37 ~ "Basic Education 1st Cycle (4th 5th Year) or Equiv",
    Fathers_qualification == 38 ~ "Basic Education 2nd Cycle (6th 7th 8th Year) or Equiv",
    Fathers_qualification == 39 ~ "Technological Specialization Course",
    Fathers_qualification == 40 ~ "Higher Education Degree (1st Cycle)",
    Fathers_qualification == 41 ~ "Specialized Higher Studies Course",
    Fathers_qualification == 42 ~ "Professional Higher Technical Course",
    Fathers_qualification == 43 ~ "Higher Education Master (2nd Cycle)",
    Fathers_qualification == 44 ~ "Higher Education Doctorate (3rd Cycle)",
    TRUE ~ as.character(Fathers_qualification)
  ))

student_data_filtered <- student_data_filtered %>%
  mutate(Mothers_occupation = case_when(
    Mothers_occupation == 0 ~ "Student",
    Mothers_occupation == 1 ~ "Representatives of the Legislative Power and Executive Bodies, Directors, Directors and Executive Managers",
    Mothers_occupation == 2 ~ "Specialists in Intellectual and Scientific Activities",
    Mothers_occupation == 3 ~ "Intermediate Level Technicians and Professions",
    Mothers_occupation == 4 ~ "Administrative staff",
    Mothers_occupation == 5 ~ "Personal Services, Security and Safety Workers and Sellers",
    Mothers_occupation == 6 ~ "Farmers and Skilled Workers in Agriculture, Fisheries and Forestry",
    Mothers_occupation == 7 ~ "Skilled Workers in Industry, Construction and Craftsmen",
    Mothers_occupation == 8 ~ "Installation and Machine Operators and Assembly Workers",
    Mothers_occupation == 9 ~ "Unskilled Workers",
    Mothers_occupation == 10 ~ "Armed Forces Professions",
    Mothers_occupation == 90 ~ "Other Situation",
    Mothers_occupation == 99 ~ "(blank)",
    Mothers_occupation == 122 ~ "Health professionals",
    Mothers_occupation == 123 ~ "Teachers",
    Mothers_occupation == 125 ~ "Specialists in Information and Communication Technologies (ICT)",
    Mothers_occupation == 131 ~ "Intermediate Level Science and Engineering Technicians and Professions",
    Mothers_occupation == 132 ~ "Technicians and Professionals of Intermediate Level of Health",
    Mothers_occupation == 134 ~ "Intermediate Level Technicians from Legal, Social, Sports, Cultural and Similar Services",
    Mothers_occupation == 141 ~ "Office Workers, Secretaries in General and Data Processing Operators",
    Mothers_occupation == 143 ~ "Data, Accounting, Statistical, Financial Services and Registry-Related Operators",
    Mothers_occupation == 144 ~ "Other Administrative Support Staff",
    Mothers_occupation == 151 ~ "Personal Service Workers",
    Mothers_occupation == 152 ~ "Sellers",
    Mothers_occupation == 153 ~ "Personal Care Workers and the Like",
    Mothers_occupation == 171 ~ "Skilled Construction Workers and the Like, Except Electricians",
    Mothers_occupation == 173 ~ "Skilled Workers in Printing, Precision Instrument Manufacturing, Jewelers, Artisans and the Like",
    Mothers_occupation == 175 ~ "Workers in Food Processing, Woodworking, Clothing and Other Industries and Crafts",
    Mothers_occupation == 191 ~ "Cleaning Workers",
    Mothers_occupation == 192 ~ "Unskilled Workers in Agriculture, Animal Production, Fisheries and Forestry",
    Mothers_occupation == 193 ~ "Unskilled Workers in Extractive Industry, Construction, Manufacturing and Transport",
    Mothers_occupation == 194 ~ "Meal Preparation Assistants",
    TRUE ~ as.character(Mothers_occupation)
  ))


student_data_filtered <- student_data_filtered %>%
  mutate(Fathers_occupation = case_when(
    Fathers_occupation == 0 ~ "Student",
    Fathers_occupation == 1 ~ "Representatives of the Legislative Power and Executive Bodies, Directors, Directors and Executive Managers",
    Fathers_occupation == 2 ~ "Specialists in Intellectual and Scientific Activities",
    Fathers_occupation == 3 ~ "Intermediate Level Technicians and Professions",
    Fathers_occupation == 4 ~ "Administrative staff",
    Fathers_occupation == 5 ~ "Personal Services, Security and Safety Workers and Sellers",
    Fathers_occupation == 6 ~ "Farmers and Skilled Workers in Agriculture, Fisheries and Forestry",
    Fathers_occupation == 7 ~ "Skilled Workers in Industry, Construction and Craftsmen",
    Fathers_occupation == 8 ~ "Installation and Machine Operators and Assembly Workers",
    Fathers_occupation == 9 ~ "Unskilled Workers",
    Fathers_occupation == 10 ~ "Armed Forces Professions",
    Fathers_occupation == 90 ~ "Other Situation",
    Fathers_occupation == 99 ~ "(blank)",
    Fathers_occupation == 101 ~ "Armed Forces Officers",
    Fathers_occupation == 102 ~ "Armed Forces Sergeants",
    Fathers_occupation == 103 ~ "Other Armed Forces personnel",
    Fathers_occupation == 112 ~ "Directors of Administrative and Commercial Services",
    Fathers_occupation == 114 ~ "Hotel, Catering, Trade, and Other Services Directors",
    Fathers_occupation == 121 ~ "Specialists in the Physical Sciences, Mathematics, Engineering, and Related Techniques",
    Fathers_occupation == 122 ~ "Health Professionals",
    Fathers_occupation == 123 ~ "Teachers",
    Fathers_occupation == 124 ~ "Specialists in Finance, Accounting, Administrative Organization, Public and Commercial Relations",
    Fathers_occupation == 131 ~ "Intermediate Level Science and Engineering Technicians and Professions",
    Fathers_occupation == 132 ~ "Technicians and Professionals of Intermediate Level of Health",
    Fathers_occupation == 134 ~ "Intermediate Level Technicians from Legal, Social, Sports, Cultural, and Similar Services",
    Fathers_occupation == 135 ~ "Information and Communication Technology Technicians",
    Fathers_occupation == 141 ~ "Office Workers, Secretaries in General and Data Processing Operators",
    Fathers_occupation == 143 ~ "Data, Accounting, Statistical, Financial Services, and Registry-Related Operators",
    Fathers_occupation == 144 ~ "Other Administrative Support Staff",
    Fathers_occupation == 151 ~ "Personal Service Workers",
    Fathers_occupation == 152 ~ "Sellers",
    Fathers_occupation == 153 ~ "Personal Care Workers and the Like",
    Fathers_occupation == 154 ~ "Protection and Security Services Personnel",
    Fathers_occupation == 161 ~ "Market-Oriented Farmers and Skilled Agricultural and Animal Production Workers",
    Fathers_occupation == 163 ~ "Farmers, Livestock Keepers, Fishermen, Hunters, and Gatherers, Subsistence",
    Fathers_occupation == 171 ~ "Skilled Construction Workers and the Like, Except Electricians",
    Fathers_occupation == 172 ~ "Skilled Workers in Metallurgy, Metalworking, and Similar",
    Fathers_occupation == 174 ~ "Skilled Workers in Electricity and Electronics",
    Fathers_occupation == 175 ~ "Workers in Food Processing, Woodworking, Clothing, and Other Industries and Crafts",
    Fathers_occupation == 181 ~ "Fixed Plant and Machine Operators",
    Fathers_occupation == 182 ~ "Assembly Workers",
    Fathers_occupation == 183 ~ "Vehicle Drivers and Mobile Equipment Operators",
    Fathers_occupation == 192 ~ "Unskilled Workers in Agriculture, Animal Production, Fisheries, and Forestry",
    Fathers_occupation == 193 ~ "Unskilled Workers in Extractive Industry, Construction, Manufacturing, and Transport",
    Fathers_occupation == 194 ~ "Meal Preparation Assistants",
    Fathers_occupation == 195 ~ "Street Vendors (Except Food) and Street Service Providers",
    TRUE ~ as.character(Fathers_occupation)
  ))

student_data_filtered <- student_data_filtered %>%
  mutate(Displaced = case_when(
    Displaced == 1 ~ "yes",
    Displaced == 0 ~ "no",
    TRUE ~ as.character(Displaced)
  ))

student_data_filtered <- student_data_filtered %>%
  mutate(Educational_special_needs = case_when(
    Educational_special_needs == 1 ~ "yes",
    Educational_special_needs == 0 ~ "no",
    TRUE ~ as.character(Educational_special_needs)
  ))
  
student_data_filtered <- student_data_filtered %>%
  mutate(Debtor = case_when(
    Debtor == 1 ~ "yes",
    Debtor == 0 ~ "no",
    TRUE ~ as.character(Debtor)
  ))
    
student_data_filtered <- student_data_filtered %>%
  mutate(Tuition_fees_up_to_date = case_when(
    Tuition_fees_up_to_date == 1 ~ "yes",
    Tuition_fees_up_to_date == 0 ~ "no",
    TRUE ~ as.character(Tuition_fees_up_to_date)
  ))

    
student_data_filtered <- student_data_filtered %>%
  mutate(Gender = case_when(
    Gender == 1 ~ "male",
    Gender == 0 ~ "female",
    TRUE ~ as.character(Gender)
  ))

student_data_filtered <- student_data_filtered %>%
  mutate(Scholarship_holder = case_when(
    Scholarship_holder == 1 ~ "yes",
    Scholarship_holder == 0 ~ "no",
    TRUE ~ as.character(Scholarship_holder)
  ))

student_data_filtered <- student_data_filtered %>%
  mutate(International = case_when(
    International == 1 ~ "yes",
    International == 0 ~ "no",
    TRUE ~ as.character(International)
  ))


write_xlsx(student_data_filtered, "student_data_filtered_final.xlsx")