This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.
When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:
# Library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(e1071)
library(MASS)
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
library(caTools)
library(psych)
##
## Attaching package: 'psych'
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(corrplot)
## corrplot 0.95 loaded
library(biotools)
## ---
## biotools version 4.3
library(ResourceSelection)
## ResourceSelection 0.3-6 2023-06-27
library(car)
## Loading required package: carData
##
## Attaching package: 'car'
##
## The following object is masked from 'package:psych':
##
## logit
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
##
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
# Load the semicolon-delimited dataset
data <- read.csv(file = "Poblacion.csv", header = TRUE, sep = ";")
# Make every column name a syntactically valid R name
names(data) <- make.names(names(data))
# Factor copy of student_dropout; the models below use it as the response
data[["target"]] <- as.factor(data[["student_dropout"]])
# Missing values per column (only columns with at least one are printed)
cat("Jumlah missing value per kolom:\n")
## Jumlah missing value per kolom:
na_mask <- is.na(data)
missing_counts <- colSums(na_mask)
print(missing_counts[which(missing_counts > 0)])
## named numeric(0)
# Total number of missing cells in the whole dataset
cat("\nTotal missing value di seluruh dataset:", sum(na_mask), "\n")
##
## Total missing value di seluruh dataset: 0
# Check for fully duplicated rows
cat("\nJumlah baris duplikat:\n")
##
## Jumlah baris duplikat:
duplicate_rows <- duplicated(data)
n_duplicates <- sum(duplicate_rows)
cat("Total baris duplikat:", n_duplicates, "\n")
## Total baris duplikat: 0
# Structure of the data frame: one line per column with its type and first values
str(data)
## 'data.frame': 2866 obs. of 63 variables:
## $ min_periodo : int 202020 202010 202010 202010 202010 202020 202010 202010 202020 202020 ...
## $ max_periodo : int 202120 202020 202120 202120 202120 202020 202120 202120 202120 202120 ...
## $ can_periodo : int 3 2 3 5 5 1 5 4 3 4 ...
## $ gender : int 1 1 2 1 1 2 1 1 1 1 ...
## $ birth_date : int 1999 1999 2000 1992 2000 1988 1988 1996 1989 1991 ...
## $ birth_month : int 4 1 8 9 3 12 4 11 11 2 ...
## $ start_age : int 22 22 21 22 20 20 20 20 21 21 ...
## $ ethnicity : int 3 5 1 3 2 3 4 2 4 4 ...
## $ marital_status : int 6 6 6 6 6 6 4 6 6 4 ...
## $ citizenship : int 1 1 1 1 1 1 1 1 1 1 ...
## $ status_academy : int 16 8 16 16 16 32 16 16 16 16 ...
## $ start_semester : int 202020 202010 202010 202010 202010 202020 202010 202010 202020 202020 ...
## $ last_activity : int 202120 202020 202120 202120 202120 202020 202120 202120 202120 202120 ...
## $ degree : int 26 45 8 34 32 10 9 16 12 21 ...
## $ faculty : int 16 18 19 14 10 19 12 19 13 10 ...
## $ num_career : int 1 2 2 1 1 1 1 1 2 2 ...
## $ num_subject : int 999 999 999 999 999 999 999 999 999 999 ...
## $ max_note_semester : int 3 2 3 4 3 4 4 4 3 4 ...
## $ min_note_semester : int 3 1 3 4 3 4 3 4 3 4 ...
## $ number_payment : int 102 116 0 0 30 0 0 104 0 24 ...
## $ last_payment_year : int 2020 2019 999 999 2019 999 999 2019 999 2020 ...
## $ note_last_semester : int 3 0 4 4 0 4 3 0 0 0 ...
## $ school_preparation_problem : int 2 2 2 2 2 2 2 2 2 2 ...
## $ academy_performance_problem: int 0 1 0 0 0 0 0 0 0 0 ...
## $ academy_change : int 0 0 0 0 0 0 0 0 0 0 ...
## $ academy_orientation : int 1 1 1 1 1 1 1 1 1 1 ...
## $ admission_semester : int 202020 202010 202010 202010 202010 202020 202010 202010 202020 202020 ...
## $ academic_level : int 5 5 5 5 5 2 2 5 2 2 ...
## $ aplication_year : int 2017 2017 2017 2018 2018 2019 2019 2018 2018 2019 ...
## $ aplication_month : int 4 7 10 1 6 8 8 6 10 1 ...
## $ faculty_admission : int 16 18 19 14 10 19 12 19 13 10 ...
## $ school : int 57 45 8 98 32 10 50 16 56 21 ...
## $ degree_admission : int 23 999 23 53 34 9 9 46 9 999 ...
## $ college : int 252 999 95 999 298 243 459 243 225 362 ...
## $ college_city : int 55 58 55 81 55 55 55 55 43 32 ...
## $ college_province : int 20 20 20 4 20 20 20 20 11 23 ...
## $ college_award : int 999 999 999 999 999 999 999 999 999 999 ...
## $ college_note : num 20 18 18 19 16 17 17 17 17 17 ...
## $ admission_exam_year : int 2017 2017 2017 2018 2018 2019 2019 2018 2018 2019 ...
## $ admission_exam_day : int 1 1 1 1 1 1 1 1 1 1 ...
## $ math_exam : int 590 458 581 707 493 631 557 654 654 758 ...
## $ writing_exam : int 587 496 609 669 527 676 571 629 534 556 ...
## $ verbal_exam : int 521 508 638 719 513 658 603 492 581 625 ...
## $ total_exam : int 1698 1462 1828 2095 1533 1965 1731 1755 1769 1939 ...
## $ max_payment_year : int 2018 2017 2020 2020 2019 2020 2020 2019 2020 2019 ...
## $ min_payment_year : int 2020 2018 2020 2020 2019 2020 2020 2019 2020 2020 ...
## $ num_transaction : int 3 2 3 5 5 1 5 4 3 4 ...
## $ payment_value : int 71357 119615 1915 5867 21672 2925 1224 44702 4792 14600 ...
## $ scholarship : int 0 1 0 0 0 0 0 1 0 0 ...
## $ financing_plan : int 1 1 0 0 1 0 0 1 0 1 ...
## $ scholarshiprefund : int 0 1 0 0 0 0 0 1 0 0 ...
## $ semester_age : int 27 23 26 22 20 20 20 20 21 21 ...
## $ final_note : num 3.33 1.11 3.4 3.85 3.3 ...
## $ semester_number : int 3 2 3 5 5 1 5 4 3 4 ...
## $ mandatory_subject : int 300 256 207 247 44 241 246 148 220 213 ...
## $ selective_subject : int 17 22 196 75 166 132 163 56 135 45 ...
## $ total_credits : int 91 67 159 107 12 154 90 31 99 193 ...
## $ number_mandatory_subject : int 999 999 999 999 999 999 999 999 999 999 ...
## $ number_elective_subject : int 999 999 999 999 999 999 999 999 999 999 ...
## $ number_selective_subject : int 3 7 4 10 5 2 5 3 5 5 ...
## $ last_semester : int 202120 202020 202120 202120 202120 202020 202120 202120 202120 202120 ...
## $ student_dropout : int 0 1 0 0 0 1 0 0 0 0 ...
## $ target : Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 1 1 1 ...
# Per-column summary statistics.
# NOTE(review): the output below reports 999 as a quartile/max value for
# several columns (e.g. final_note, note_last_semester, college_award) --
# this looks like a missing-value code rather than a measurement; confirm
# against the data dictionary before treating those columns as continuous.
summary(data)
## min_periodo max_periodo can_periodo gender
## Min. :202010 Min. :202010 Min. :1.000 Min. :1.000
## 1st Qu.:202010 1st Qu.:202120 1st Qu.:2.000 1st Qu.:1.000
## Median :202020 Median :202120 Median :3.000 Median :2.000
## Mean :202051 Mean :202113 Mean :3.046 Mean :1.507
## 3rd Qu.:202110 3rd Qu.:202120 3rd Qu.:4.000 3rd Qu.:2.000
## Max. :202120 Max. :202120 Max. :5.000 Max. :3.000
## birth_date birth_month start_age ethnicity
## Min. :1968 Min. : 1.000 Min. : 0.00 Min. :1.000
## 1st Qu.:2001 1st Qu.: 3.250 1st Qu.:18.00 1st Qu.:3.000
## Median :2002 Median : 6.000 Median :19.00 Median :4.000
## Mean :2001 Mean : 6.445 Mean :18.74 Mean :4.035
## 3rd Qu.:2003 3rd Qu.: 9.000 3rd Qu.:19.00 3rd Qu.:6.000
## Max. :2021 Max. :12.000 Max. :28.00 Max. :7.000
## marital_status citizenship status_academy start_semester
## Min. :2.000 Min. :1.000 Min. : 8.00 Min. :202010
## 1st Qu.:6.000 1st Qu.:1.000 1st Qu.:16.00 1st Qu.:202010
## Median :6.000 Median :1.000 Median :16.00 Median :202020
## Mean :5.944 Mean :1.044 Mean :16.03 Mean :202051
## 3rd Qu.:6.000 3rd Qu.:1.000 3rd Qu.:16.00 3rd Qu.:202110
## Max. :8.000 Max. :4.000 Max. :32.00 Max. :202120
## last_activity degree faculty num_career num_subject
## Min. :202010 Min. : 1.00 Min. :10.00 Min. :1.00 Min. :999
## 1st Qu.:202120 1st Qu.:13.00 1st Qu.:12.00 1st Qu.:1.00 1st Qu.:999
## Median :202120 Median :27.00 Median :16.00 Median :1.00 Median :999
## Mean :202113 Mean :24.75 Mean :16.36 Mean :1.07 Mean :999
## 3rd Qu.:202120 3rd Qu.:34.00 3rd Qu.:18.00 3rd Qu.:1.00 3rd Qu.:999
## Max. :202120 Max. :99.00 Max. :29.00 Max. :3.00 Max. :999
## max_note_semester min_note_semester number_payment last_payment_year
## Min. : 0.0 Min. : 0.0 Min. : 0.000 Min. : 999
## 1st Qu.: 3.0 1st Qu.: 3.0 1st Qu.: 0.000 1st Qu.: 999
## Median : 4.0 Median : 4.0 Median : 0.000 Median : 999
## Mean :293.5 Mean :293.4 Mean : 2.211 Mean :1090
## 3rd Qu.:999.0 3rd Qu.:999.0 3rd Qu.: 0.000 3rd Qu.: 999
## Max. :999.0 Max. :999.0 Max. :116.000 Max. :2020
## note_last_semester school_preparation_problem academy_performance_problem
## Min. : 0.0 Min. :2 Min. : 0.000
## 1st Qu.: 0.0 1st Qu.:2 1st Qu.: 0.000
## Median : 4.0 Median :2 Median : 0.000
## Mean :354.1 Mean :2 Mean : 7.446
## 3rd Qu.:999.0 3rd Qu.:2 3rd Qu.: 0.000
## Max. :999.0 Max. :2 Max. :999.000
## academy_change academy_orientation admission_semester academic_level
## Min. :0 Min. :1 Min. :202010 Min. :1.000
## 1st Qu.:0 1st Qu.:1 1st Qu.:202010 1st Qu.:5.000
## Median :0 Median :1 Median :202020 Median :5.000
## Mean :0 Mean :1 Mean :202051 Mean :4.839
## 3rd Qu.:0 3rd Qu.:1 3rd Qu.:202110 3rd Qu.:5.000
## Max. :0 Max. :1 Max. :202120 Max. :5.000
## aplication_year aplication_month faculty_admission school
## Min. :2017 Min. : 1.000 Min. :10.00 Min. : 4.00
## 1st Qu.:2020 1st Qu.: 2.000 1st Qu.:12.00 1st Qu.: 28.00
## Median :2020 Median : 5.000 Median :16.00 Median : 43.00
## Mean :2020 Mean : 5.785 Mean :16.36 Mean : 55.21
## 3rd Qu.:2020 3rd Qu.:10.000 3rd Qu.:18.00 3rd Qu.: 51.00
## Max. :2021 Max. :12.000 Max. :29.00 Max. :999.00
## degree_admission college college_city college_province
## Min. : 2.00 Min. : 3 Min. : 1.0 Min. : 1.00
## 1st Qu.: 15.00 1st Qu.:158 1st Qu.: 55.0 1st Qu.: 20.00
## Median : 27.00 Median :288 Median : 55.0 Median : 20.00
## Mean : 68.67 Mean :362 Mean :112.2 Mean : 73.33
## 3rd Qu.: 46.00 3rd Qu.:459 3rd Qu.: 55.0 3rd Qu.: 20.00
## Max. :999.00 Max. :999 Max. :999.0 Max. :999.00
## college_award college_note admission_exam_year admission_exam_day
## Min. : 2.0 Min. : 7.20 Min. :2017 Min. :1
## 1st Qu.:999.0 1st Qu.:16.00 1st Qu.:2020 1st Qu.:1
## Median :999.0 Median :17.00 Median :2020 Median :1
## Mean :994.2 Mean :17.46 Mean :2020 Mean :1
## 3rd Qu.:999.0 3rd Qu.:19.00 3rd Qu.:2020 3rd Qu.:1
## Max. :999.0 Max. :20.00 Max. :2021 Max. :1
## math_exam writing_exam verbal_exam total_exam
## Min. : 334.0 Min. : 329.0 Min. : 323.0 Min. :1046
## 1st Qu.: 621.0 1st Qu.: 582.0 1st Qu.: 590.0 1st Qu.:1916
## Median : 708.0 Median : 669.0 Median : 670.0 Median :2091
## Mean : 718.2 Mean : 697.1 Mean : 696.1 Mean :2110
## 3rd Qu.: 809.0 3rd Qu.: 799.0 3rd Qu.: 787.0 3rd Qu.:2307
## Max. :1000.0 Max. :1000.0 Max. :1000.0 Max. :2931
## max_payment_year min_payment_year num_transaction payment_value
## Min. :2017 Min. :2018 Min. :1.000 Min. : 1
## 1st Qu.:2020 1st Qu.:2020 1st Qu.:2.000 1st Qu.: 1667
## Median :2020 Median :2020 Median :3.000 Median : 3228
## Mean :2020 Mean :2020 Mean :3.046 Mean : 3742
## 3rd Qu.:2021 3rd Qu.:2021 3rd Qu.:4.000 3rd Qu.: 4855
## Max. :2021 Max. :2021 Max. :5.000 Max. :119615
## scholarship financing_plan scholarshiprefund semester_age
## Min. :0.00000 Min. :0.00000 Min. :0.00000 Min. :13.00
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:18.00
## Median :0.00000 Median :0.00000 Median :0.00000 Median :19.00
## Mean :0.01326 Mean :0.08444 Mean :0.00977 Mean :19.43
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:20.00
## Max. :1.00000 Max. :1.00000 Max. :1.00000 Max. :56.00
## final_note semester_number mandatory_subject selective_subject
## Min. : 0.000 Min. :1.000 Min. : 0.00 Min. : 0.0
## 1st Qu.: 3.138 1st Qu.:2.000 1st Qu.: 77.25 1st Qu.: 52.0
## Median : 3.750 Median :3.000 Median :150.00 Median :101.0
## Mean :293.455 Mean :3.053 Mean :150.53 Mean :101.4
## 3rd Qu.:999.000 3rd Qu.:4.000 3rd Qu.:222.75 3rd Qu.:153.0
## Max. :999.000 Max. :5.000 Max. :300.00 Max. :200.0
## total_credits number_mandatory_subject number_elective_subject
## Min. : 0.0 Min. :999 Min. :999
## 1st Qu.: 53.0 1st Qu.:999 1st Qu.:999
## Median :101.0 Median :999 Median :999
## Mean :100.5 Mean :999 Mean :999
## 3rd Qu.:149.0 3rd Qu.:999 3rd Qu.:999
## Max. :200.0 Max. :999 Max. :999
## number_selective_subject last_semester student_dropout target
## Min. : 0.000 Min. :202010 Min. :0.00000 0:2735
## 1st Qu.: 2.000 1st Qu.:202120 1st Qu.:0.00000 1: 131
## Median : 5.000 Median :202120 Median :0.00000
## Mean : 4.997 Mean :202113 Mean :0.04571
## 3rd Qu.: 8.000 3rd Qu.:202120 3rd Qu.:0.00000
## Max. :10.000 Max. :202120 Max. :1.00000
# Descriptive statistics for the numeric columns.
# vapply() always returns a logical vector (sapply() returns list() for a
# zero-column input), and drop = FALSE keeps a data frame even if only one
# column is numeric.
numeric_cols <- vapply(data, is.numeric, logical(1))
describe(data[, numeric_cols, drop = FALSE])
## vars n mean sd median trimmed
## min_periodo 1 2866 202051.07 47.71 202020.00 202048.65
## max_periodo 2 2866 202112.61 24.19 202120.00 202118.89
## can_periodo 3 2866 3.05 1.35 3.00 3.06
## gender 4 2866 1.51 0.50 2.00 1.51
## birth_date 5 2866 2001.37 3.40 2002.00 2002.03
## birth_month 6 2866 6.45 3.47 6.00 6.45
## start_age 7 2866 18.74 1.47 19.00 18.69
## ethnicity 8 2866 4.04 1.79 4.00 4.04
## marital_status 9 2866 5.94 0.46 6.00 6.00
## citizenship 10 2866 1.04 0.30 1.00 1.00
## status_academy 11 2866 16.03 2.28 16.00 16.00
## start_semester 12 2866 202051.07 47.71 202020.00 202048.65
## last_activity 13 2866 202112.91 23.72 202120.00 202118.98
## degree 14 2866 24.75 14.07 27.00 24.88
## faculty 15 2866 16.36 5.73 16.00 15.66
## num_career 16 2866 1.07 0.27 1.00 1.00
## num_subject 17 2866 999.00 0.00 999.00 999.00
## max_note_semester 18 2866 293.54 452.80 4.00 242.19
## min_note_semester 19 2866 293.45 452.86 4.00 242.07
## number_payment 20 2866 2.21 8.96 0.00 0.00
## last_payment_year 21 2866 1090.18 291.18 999.00 999.00
## note_last_semester 22 2866 354.14 476.88 4.00 317.89
## school_preparation_problem 23 2866 2.00 0.00 2.00 2.00
## academy_performance_problem 24 2866 7.45 85.20 0.00 0.04
## academy_change 25 2866 0.00 0.00 0.00 0.00
## academy_orientation 26 2866 1.00 0.00 1.00 1.00
## admission_semester 27 2866 202051.07 47.71 202020.00 202048.65
## academic_level 28 2866 4.84 0.68 5.00 5.00
## aplication_year 29 2866 2020.06 0.68 2020.00 2020.09
## aplication_month 30 2866 5.79 3.78 5.00 5.62
## faculty_admission 31 2866 16.36 5.73 16.00 15.66
## school 32 2866 55.21 109.32 43.00 41.13
## degree_admission 33 2866 68.67 195.29 27.00 29.11
## college 34 2866 362.05 286.37 288.00 322.74
## college_city 35 2866 112.24 236.84 55.00 52.84
## college_province 36 2866 73.33 225.91 20.00 19.38
## college_award 37 2866 994.16 69.06 999.00 999.00
## college_note 38 2866 17.46 1.57 17.00 17.46
## admission_exam_year 39 2866 2020.06 0.68 2020.00 2020.09
## admission_exam_day 40 2866 1.00 0.00 1.00 1.00
## math_exam 41 2866 718.18 133.55 708.00 714.73
## writing_exam 42 2866 697.08 140.96 669.00 688.33
## verbal_exam 43 2866 696.13 137.87 670.00 687.55
## total_exam 44 2866 2109.98 292.35 2091.00 2106.48
## max_payment_year 45 2866 2020.36 0.55 2020.00 2020.36
## min_payment_year 46 2866 2020.37 0.53 2020.00 2020.36
## num_transaction 47 2866 3.05 1.35 3.00 3.06
## payment_value 48 2866 3741.75 4250.82 3227.50 3257.55
## scholarship 49 2866 0.01 0.11 0.00 0.00
## financing_plan 50 2866 0.08 0.28 0.00 0.00
## scholarshiprefund 51 2866 0.01 0.10 0.00 0.00
## semester_age 52 2866 19.43 2.16 19.00 19.04
## final_note 53 2866 293.45 452.85 3.75 242.08
## semester_number 54 2866 3.05 1.35 3.00 3.06
## mandatory_subject 55 2866 150.53 85.39 150.00 150.58
## selective_subject 56 2866 101.39 57.25 101.00 101.53
## total_credits 57 2866 100.51 57.42 101.00 100.64
## number_mandatory_subject 58 2866 999.00 0.00 999.00 999.00
## number_elective_subject 59 2866 999.00 0.00 999.00 999.00
## number_selective_subject 60 2866 5.00 2.97 5.00 4.99
## last_semester 61 2866 202112.91 23.72 202120.00 202118.98
## student_dropout 62 2866 0.05 0.21 0.00 0.00
## mad min max range skew kurtosis
## min_periodo 14.83 202010.0 202120 110.0 0.43 -1.79
## max_periodo 0.00 202010.0 202120 110.0 -3.63 11.51
## can_periodo 1.48 1.0 5 4.0 0.18 -1.38
## gender 0.00 1.0 3 2.0 -0.02 -1.98
## birth_date 1.48 1968.0 2021 53.0 -3.48 22.45
## birth_month 4.45 1.0 12 11.0 0.02 -1.20
## start_age 1.48 0.0 28 28.0 -2.78 47.90
## ethnicity 2.97 1.0 7 6.0 -0.03 -1.11
## marital_status 0.00 2.0 8 6.0 -7.77 62.39
## citizenship 0.00 1.0 4 3.0 6.57 41.57
## status_academy 0.00 8.0 32 24.0 2.67 26.28
## start_semester 14.83 202010.0 202120 110.0 0.43 -1.79
## last_activity 0.00 202010.0 202120 110.0 -3.72 12.19
## degree 13.34 1.0 99 98.0 0.17 0.61
## faculty 4.45 10.0 29 19.0 0.92 -0.09
## num_career 0.00 1.0 3 2.0 3.79 14.23
## num_subject 0.00 999.0 999 0.0 NaN NaN
## max_note_semester 1.48 0.0 999 999.0 0.92 -1.16
## min_note_semester 1.48 0.0 999 999.0 0.92 -1.16
## number_payment 0.00 0.0 116 116.0 5.73 41.12
## last_payment_year 0.00 999.0 2020 1021.0 2.88 6.29
## note_last_semester 5.93 0.0 999 999.0 0.61 -1.63
## school_preparation_problem 0.00 2.0 2 0.0 NaN NaN
## academy_performance_problem 0.00 0.0 999 999.0 11.55 131.39
## academy_change 0.00 0.0 0 0.0 NaN NaN
## academy_orientation 0.00 1.0 1 0.0 NaN NaN
## admission_semester 14.83 202010.0 202120 110.0 0.43 -1.79
## academic_level 0.00 1.0 5 4.0 -4.01 14.19
## aplication_year 0.00 2017.0 2021 4.0 -0.35 0.23
## aplication_month 4.45 1.0 12 11.0 0.28 -1.34
## faculty_admission 4.45 10.0 29 19.0 0.92 -0.09
## school 19.27 4.0 999 995.0 8.08 66.79
## degree_admission 23.72 2.0 999 997.0 4.51 18.57
## college 225.36 3.0 999 996.0 1.20 0.55
## college_city 0.00 1.0 999 998.0 3.46 10.04
## college_province 0.00 1.0 999 998.0 3.85 12.84
## college_award 0.00 2.0 999 997.0 -14.20 199.61
## college_note 1.48 7.2 20 12.8 -0.38 1.19
## admission_exam_year 0.00 2017.0 2021 4.0 -0.35 0.23
## admission_exam_day 0.00 1.0 1 0.0 NaN NaN
## math_exam 138.62 334.0 1000 666.0 0.19 -0.69
## writing_exam 145.29 329.0 1000 671.0 0.49 -0.74
## verbal_exam 139.36 323.0 1000 677.0 0.48 -0.67
## total_exam 289.11 1046.0 2931 1885.0 0.10 -0.23
## max_payment_year 0.00 2017.0 2021 4.0 -0.13 -0.25
## min_payment_year 0.00 2018.0 2021 3.0 0.06 -0.92
## num_transaction 1.48 1.0 5 4.0 0.18 -1.38
## payment_value 2372.90 1.0 119615 119614.0 10.47 222.91
## scholarship 0.00 0.0 1 1.0 8.51 70.38
## financing_plan 0.00 0.0 1 1.0 2.99 6.93
## scholarshiprefund 0.00 0.0 1 1.0 9.96 97.30
## semester_age 1.48 13.0 56 43.0 5.24 54.53
## final_note 1.25 0.0 999 999.0 0.92 -1.16
## semester_number 1.48 1.0 5 4.0 0.17 -1.39
## mandatory_subject 108.23 0.0 300 300.0 0.00 -1.17
## selective_subject 74.13 0.0 200 200.0 0.00 -1.20
## total_credits 71.16 0.0 200 200.0 -0.01 -1.17
## number_mandatory_subject 0.00 999.0 999 0.0 NaN NaN
## number_elective_subject 0.00 999.0 999 0.0 NaN NaN
## number_selective_subject 4.45 0.0 10 10.0 0.02 -1.21
## last_semester 0.00 202010.0 202120 110.0 -3.72 12.19
## student_dropout 0.00 0.0 1 1.0 4.35 16.91
## se
## min_periodo 0.89
## max_periodo 0.45
## can_periodo 0.03
## gender 0.01
## birth_date 0.06
## birth_month 0.06
## start_age 0.03
## ethnicity 0.03
## marital_status 0.01
## citizenship 0.01
## status_academy 0.04
## start_semester 0.89
## last_activity 0.44
## degree 0.26
## faculty 0.11
## num_career 0.00
## num_subject 0.00
## max_note_semester 8.46
## min_note_semester 8.46
## number_payment 0.17
## last_payment_year 5.44
## note_last_semester 8.91
## school_preparation_problem 0.00
## academy_performance_problem 1.59
## academy_change 0.00
## academy_orientation 0.00
## admission_semester 0.89
## academic_level 0.01
## aplication_year 0.01
## aplication_month 0.07
## faculty_admission 0.11
## school 2.04
## degree_admission 3.65
## college 5.35
## college_city 4.42
## college_province 4.22
## college_award 1.29
## college_note 0.03
## admission_exam_year 0.01
## admission_exam_day 0.00
## math_exam 2.49
## writing_exam 2.63
## verbal_exam 2.58
## total_exam 5.46
## max_payment_year 0.01
## min_payment_year 0.01
## num_transaction 0.03
## payment_value 79.40
## scholarship 0.00
## financing_plan 0.01
## scholarshiprefund 0.00
## semester_age 0.04
## final_note 8.46
## semester_number 0.03
## mandatory_subject 1.60
## selective_subject 1.07
## total_credits 1.07
## number_mandatory_subject 0.00
## number_elective_subject 0.00
## number_selective_subject 0.06
## last_semester 0.44
## student_dropout 0.00
# Correlation analysis
# Logical mask of numeric columns; vapply() keeps the result type fixed.
num_cols <- vapply(data, is.numeric, logical(1))
# Identify numeric columns with zero spread. Counting distinct non-missing
# values avoids comparing a floating-point sd() with 0 and never yields NA
# (sd() is NA for an all-NA or single-value column, which would otherwise
# inject NA into the index below).
is_constant <- vapply(
  data[, num_cols, drop = FALSE],
  function(x) length(unique(x[!is.na(x)])) <= 1L,
  logical(1)
)
sd_zero_cols <- names(data)[num_cols][is_constant]
# Drop the constant columns: their correlation is undefined.
# drop = FALSE keeps `data` a data frame whatever the number of columns left.
data <- data[, !(names(data) %in% sd_zero_cols), drop = FALSE]
# Correlation matrix of the remaining numeric columns, plotted as a heat map
corr_matrix <- cor(
  data[, vapply(data, is.numeric, logical(1)), drop = FALSE],
  use = "complete.obs"
)
corrplot(corr_matrix, method = "color", type = "upper", tl.cex = 0.6)
# Predictors used by the models below
selected_vars <- c(
  "final_note", "semester_number", "total_credits",
  "math_exam", "verbal_exam", "payment_value"
)
# Train/test split: sample.split() returns a logical vector over the rows;
# TRUE rows form the training set, FALSE rows the test set.
set.seed(123)
split <- sample.split(data$target, SplitRatio = 0.7)
train <- data[split, ]
test <- data[!split, ]
# Logistic regression of target on the selected predictors.
# NOTE(review): the summary output earlier in this document shows final_note
# taking the value 999 for a large share of rows -- presumably a missing-value
# code; confirm it should enter the model as a numeric predictor.
formula_log <- reformulate(selected_vars, response = "target")
log_model <- glm(formula = formula_log, family = "binomial", data = train)
summary(log_model)
##
## Call:
## glm(formula = formula_log, family = "binomial", data = train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.635e+00 9.641e-01 2.733 0.00627 **
## final_note -8.430e-03 4.518e-03 -1.866 0.06204 .
## semester_number -2.175e+00 2.075e-01 -10.485 < 2e-16 ***
## total_credits -1.406e-03 2.374e-03 -0.592 0.55366
## math_exam 8.743e-04 1.002e-03 0.872 0.38307
## verbal_exam -1.144e-03 1.027e-03 -1.114 0.26516
## payment_value 4.967e-05 2.407e-05 2.063 0.03908 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 746.82 on 2005 degrees of freedom
## Residual deviance: 381.63 on 1999 degrees of freedom
## AIC: 395.63
##
## Number of Fisher Scoring iterations: 11
# Variance inflation factors of the logistic model's predictors
# (multicollinearity check)
vif(log_model)
## final_note semester_number total_credits math_exam verbal_exam
## 1.000457 1.065900 1.027318 1.092876 1.078042
## payment_value
## 1.041177
# Influence plot of the logistic model.
# `id.method` is not an argument of influencePlot(): it was passed through to
# the base graphics calls, each of which rejected it with a
# '"id.method" is not a graphical parameter' warning. Point labelling is
# controlled through `id`; id = TRUE requests the default automatic labelling
# of noteworthy points, which also works in a non-interactive knit.
influencePlot(
  log_model,
  id = TRUE,
  main = "Influence Plot",
  sub = "Circle size is proportional to Cook's distance"
)
## StudRes Hat CookD
## 1 -1.2952899 0.585845370 0.2725643505
## 2 0.2041173 0.141990019 0.0005348591
## 1442 2.9907334 0.001940802 0.0222671779
## 1498 2.8970107 0.003475307 0.0294724274
# Normal Q-Q plot of the model's deviance residuals, with a red reference line
resid_log <- resid(log_model, type = "deviance")
qqnorm(y = resid_log)
qqline(y = resid_log, col = "red")
# Hosmer-Lemeshow goodness-of-fit test: observed outcome vs. fitted
# probabilities on the training set. as.numeric() on the two-level factor
# gives the level codes 1/2, hence the "- 1" to recover 0/1.
# The call expression is echoed in the test's "data:" line, so it is kept inline.
hoslem.test(as.numeric(train$target) - 1, fitted(log_model))
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: as.numeric(train$target) - 1, fitted(log_model)
## X-squared = 1.0069, df = 8, p-value = 0.9982
# Predicted probabilities on the test set, classified at a 0.5 cut-off
log_prob <- predict(log_model, newdata = test, type = "response")
# Pin the factor levels to those of the reference. as.factor() would create
# only the levels that actually occur, so a model predicting a single class
# would produce a one-level factor and confusionMatrix() would fail.
log_class <- factor(
  ifelse(log_prob > 0.5, 1, 0),
  levels = levels(test$target)
)
# NOTE(review): the report below uses "0" as the positive class, so
# Sensitivity refers to non-dropouts; consider positive = "1" if dropout is
# the class of interest.
confusionMatrix(log_class, test$target)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 798 16
## 1 23 23
##
## Accuracy : 0.9547
## 95% CI : (0.9385, 0.9676)
## No Information Rate : 0.9547
## P-Value [Acc > NIR] : 0.5425
##
## Kappa : 0.5175
##
## Mcnemar's Test P-Value : 0.3367
##
## Sensitivity : 0.9720
## Specificity : 0.5897
## Pos Pred Value : 0.9803
## Neg Pred Value : 0.5000
## Prevalence : 0.9547
## Detection Rate : 0.9279
## Detection Prevalence : 0.9465
## Balanced Accuracy : 0.7809
##
## 'Positive' Class : 0
##
# Linear discriminant analysis on the same predictors, evaluated on the test set
lda_formula <- reformulate(selected_vars, response = "target")
lda_model <- lda(lda_formula, data = train)
lda_pred <- predict(lda_model, newdata = test)
confusionMatrix(data = lda_pred$class, reference = test$target)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 792 15
## 1 29 24
##
## Accuracy : 0.9488
## 95% CI : (0.9319, 0.9626)
## No Information Rate : 0.9547
## P-Value [Acc > NIR] : 0.81769
##
## Kappa : 0.4954
##
## Mcnemar's Test P-Value : 0.05002
##
## Sensitivity : 0.9647
## Specificity : 0.6154
## Pos Pred Value : 0.9814
## Neg Pred Value : 0.4528
## Prevalence : 0.9547
## Detection Rate : 0.9209
## Detection Prevalence : 0.9384
## Balanced Accuracy : 0.7900
##
## 'Positive' Class : 0
##
# Univariate normality test (Shapiro-Wilk) for each predictor within each class
cat("Uji normalitas univariat (Shapiro-Wilk) per kelas:\n")
## Uji normalitas univariat (Shapiro-Wilk) per kelas:
for (v in selected_vars) {
  for (g in levels(train$target)) {
    x <- train[train$target == g, v]
    # Missing values must not count as a distinct value or towards n
    x <- x[!is.na(x)]
    n_obs <- length(x)
    # shapiro.test() stops with an error unless 3 <= n <= 5000 and the values
    # are not all identical; skip such groups instead of aborting the loop.
    if (n_obs >= 3L && n_obs <= 5000L && length(unique(x)) > 1) {
      sw <- shapiro.test(x)
      cat(sprintf("Variabel: %s, Group: %s → p-value: %.4f\n", v, g, sw$p.value))
    }
  }
}
## Variabel: final_note, Group: 0 → p-value: 0.0000
## Variabel: final_note, Group: 1 → p-value: 0.0000
## Variabel: semester_number, Group: 0 → p-value: 0.0000
## Variabel: semester_number, Group: 1 → p-value: 0.0000
## Variabel: total_credits, Group: 0 → p-value: 0.0000
## Variabel: total_credits, Group: 1 → p-value: 0.0559
## Variabel: math_exam, Group: 0 → p-value: 0.0000
## Variabel: math_exam, Group: 1 → p-value: 0.2022
## Variabel: verbal_exam, Group: 0 → p-value: 0.0000
## Variabel: verbal_exam, Group: 1 → p-value: 0.0037
## Variabel: payment_value, Group: 0 → p-value: 0.0000
## Variabel: payment_value, Group: 1 → p-value: 0.0000
# Multivariate normality test (Mardia)
# requireNamespace() only checks that MVN is installed; library() then attaches
# it once and fails loudly if the installation did not succeed. require() is
# avoided because it returns FALSE instead of raising an error.
if (!requireNamespace("MVN", quietly = TRUE)) {
  install.packages("MVN")
}
library(MVN)
cat("\nUji Mardia untuk multivariat normalitas:\n")
##
## Uji Mardia untuk multivariat normalitas:
# NOTE(review): `mvnTest` and `$multivariateNormality` are the names that
# worked for this run; newer MVN releases may have renamed them -- verify
# against the installed version.
mvn_result <- mvn(data = train[, selected_vars], mvnTest = "mardia")
print(mvn_result$multivariateNormality)
## Test Statistic p value Result
## 1 Mardia Skewness 49251.8740969285 0 NO
## 2 Mardia Kurtosis 574.496723683397 0 NO
## 3 MVN <NA> <NA> NO
# Box's M test for equality of the covariance matrices across the target classes
cat("\nUji Box's M untuk kesamaan matriks kovarians:\n")
##
## Uji Box's M untuk kesamaan matriks kovarians:
boxm <- boxM(data = train[, selected_vars], grouping = train$target)
print(boxm)
##
## Box's M-test for Homogeneity of Covariance Matrices
##
## data: train[, selected_vars]
## Chi-Sq (approx.) = 1537.4, df = 21, p-value < 2.2e-16
# Pairwise correlations between the selected predictors on the training set
cat("\nKorelasi antar variabel prediktor:\n")
##
## Korelasi antar variabel prediktor:
corr_matrix_lda <- cor(x = train[selected_vars])
print(round(corr_matrix_lda, digits = 2))
## final_note semester_number total_credits math_exam verbal_exam
## final_note 1.00 -0.60 -0.03 0.13 0.13
## semester_number -0.60 1.00 -0.01 -0.10 -0.14
## total_credits -0.03 -0.01 1.00 -0.04 -0.01
## math_exam 0.13 -0.10 -0.04 1.00 0.24
## verbal_exam 0.13 -0.14 -0.01 0.24 1.00
## payment_value -0.11 0.10 -0.04 -0.09 -0.09
## payment_value
## final_note -0.11
## semester_number 0.10
## total_credits -0.04
## math_exam -0.09
## verbal_exam -0.09
## payment_value 1.00
# Predictor correlations drawn as numbers in the upper triangle
corrplot(corr_matrix_lda, method = "number", type = "upper", tl.cex = 0.7)
# Visualisation
# NOTE(review): this draws the same kind of plot from the same columns as the
# call just above; the two only differ if `train` contains missing values --
# confirm the repetition is intended.
predictor_corr <- cor(train[, selected_vars], use = "complete.obs")
corrplot(predictor_corr, method = "number", type = "upper", tl.cex = 0.7)
# Distribution of final_note by dropout status (full dataset, not only train)
ggplot(data = data, mapping = aes(x = target, y = final_note, fill = target)) +
  geom_boxplot() +
  labs(
    title = "Boxplot Final Note berdasarkan Status Dropout",
    x = "Status Dropout",
    y = "Final Note"
  ) +
  theme_minimal()
# ROC curve and AUC of the logistic model on the test set.
# `levels` and `direction` are stated explicitly instead of being guessed from
# the data: controls are coded 0, cases 1, and cases are expected to have the
# higher predicted probability. These are the values roc() previously chose
# by itself (and reported through "Setting levels"/"Setting direction"
# messages), so the curve is unchanged but no longer depends on auto-detection.
roc_obj <- roc(
  response = as.numeric(test$target) - 1,
  predictor = log_prob,
  levels = c(0, 1),
  direction = "<"
)
plot(roc_obj, col = "blue", lwd = 2, main = "ROC Curve - Regresi Logistik")
# Diagonal reference line
abline(a = 0, b = 1, lty = 2, col = "gray")
cat("AUC:", auc(roc_obj), "\n")
## AUC: 0.9369437
# Heat map of the logistic model's confusion matrix on the test set
cm <- confusionMatrix(data = log_class, reference = test$target)
# Long format: one row per (Prediction, Reference) cell with its count in Freq
cm_df <- as.data.frame(cm$table)
ggplot(data = cm_df, mapping = aes(x = Prediction, y = Reference, fill = Freq)) +
  geom_tile(color = "white") +
  geom_text(mapping = aes(label = Freq), size = 5) +
  scale_fill_gradient(low = "white", high = "steelblue") +
  labs(
    title = "Confusion Matrix - Regresi Logistik",
    x = "Prediksi",
    y = "Kenyataan"
  ) +
  theme_minimal()
You can also embed plots, for example:
Note that the `echo = FALSE` parameter was added to the code chunk to prevent printing of the R code that generated the plot.