R Markdown

This is an R Markdown document. Markdown is a simple formatting syntax for authoring HTML, PDF, and MS Word documents. For more details on using R Markdown see http://rmarkdown.rstudio.com.

When you click the Knit button a document will be generated that includes both content as well as the output of any embedded R code chunks within the document. You can embed an R code chunk like this:

# Library
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift
library(e1071)
library(MASS)
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
library(caTools)
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(lmtest)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(corrplot)
## corrplot 0.95 loaded
library(biotools)
## ---
## biotools version 4.3
library(ResourceSelection)
## ResourceSelection 0.3-6   2023-06-27
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:psych':
## 
##     logit
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
# Load data ----
# Read the semicolon-delimited student file (first row holds the column
# names), make every column name syntactically valid, and add the dropout
# flag as a factor column `target`.
# NOTE(review): the summaries further down show 999 in many columns (e.g.
# final_note has median 3.75 but max 999) - presumably a missing-value code;
# confirm against the data dictionary before modelling on those columns.
data <- read.csv(file = "Poblacion.csv", header = TRUE, sep = ";")
names(data) <- make.names(names(data))
data[["target"]] <- as.factor(data[["student_dropout"]])

# Missing values per column ----
# Build the NA mask once; print only the columns that have at least one NA.
cat("Jumlah missing value per kolom:\n")
## Jumlah missing value per kolom:
na_mask <- is.na(data)
missing_counts <- colSums(na_mask)
print(missing_counts[missing_counts > 0])
## named numeric(0)
# Total number of missing values across the whole dataset
cat("\nTotal missing value di seluruh dataset:", sum(na_mask), "\n")
## 
## Total missing value di seluruh dataset: 0
# Duplicate-row check ----
cat("\n", "Jumlah baris duplikat:", "\n", sep = "")
## 
## Jumlah baris duplikat:
# Flag every row that exactly repeats an earlier row, then report how many.
duplicate_rows <- duplicated(data)
n_duplicates <- sum(duplicate_rows)
cat("Total baris duplikat:", n_duplicates, "\n")
## Total baris duplikat: 0
# Structure and summary ----
# Compact listing of every column: its type and first few values.
utils::str(data)
## 'data.frame':    2866 obs. of  63 variables:
##  $ min_periodo                : int  202020 202010 202010 202010 202010 202020 202010 202010 202020 202020 ...
##  $ max_periodo                : int  202120 202020 202120 202120 202120 202020 202120 202120 202120 202120 ...
##  $ can_periodo                : int  3 2 3 5 5 1 5 4 3 4 ...
##  $ gender                     : int  1 1 2 1 1 2 1 1 1 1 ...
##  $ birth_date                 : int  1999 1999 2000 1992 2000 1988 1988 1996 1989 1991 ...
##  $ birth_month                : int  4 1 8 9 3 12 4 11 11 2 ...
##  $ start_age                  : int  22 22 21 22 20 20 20 20 21 21 ...
##  $ ethnicity                  : int  3 5 1 3 2 3 4 2 4 4 ...
##  $ marital_status             : int  6 6 6 6 6 6 4 6 6 4 ...
##  $ citizenship                : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ status_academy             : int  16 8 16 16 16 32 16 16 16 16 ...
##  $ start_semester             : int  202020 202010 202010 202010 202010 202020 202010 202010 202020 202020 ...
##  $ last_activity              : int  202120 202020 202120 202120 202120 202020 202120 202120 202120 202120 ...
##  $ degree                     : int  26 45 8 34 32 10 9 16 12 21 ...
##  $ faculty                    : int  16 18 19 14 10 19 12 19 13 10 ...
##  $ num_career                 : int  1 2 2 1 1 1 1 1 2 2 ...
##  $ num_subject                : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ max_note_semester          : int  3 2 3 4 3 4 4 4 3 4 ...
##  $ min_note_semester          : int  3 1 3 4 3 4 3 4 3 4 ...
##  $ number_payment             : int  102 116 0 0 30 0 0 104 0 24 ...
##  $ last_payment_year          : int  2020 2019 999 999 2019 999 999 2019 999 2020 ...
##  $ note_last_semester         : int  3 0 4 4 0 4 3 0 0 0 ...
##  $ school_preparation_problem : int  2 2 2 2 2 2 2 2 2 2 ...
##  $ academy_performance_problem: int  0 1 0 0 0 0 0 0 0 0 ...
##  $ academy_change             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ academy_orientation        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ admission_semester         : int  202020 202010 202010 202010 202010 202020 202010 202010 202020 202020 ...
##  $ academic_level             : int  5 5 5 5 5 2 2 5 2 2 ...
##  $ aplication_year            : int  2017 2017 2017 2018 2018 2019 2019 2018 2018 2019 ...
##  $ aplication_month           : int  4 7 10 1 6 8 8 6 10 1 ...
##  $ faculty_admission          : int  16 18 19 14 10 19 12 19 13 10 ...
##  $ school                     : int  57 45 8 98 32 10 50 16 56 21 ...
##  $ degree_admission           : int  23 999 23 53 34 9 9 46 9 999 ...
##  $ college                    : int  252 999 95 999 298 243 459 243 225 362 ...
##  $ college_city               : int  55 58 55 81 55 55 55 55 43 32 ...
##  $ college_province           : int  20 20 20 4 20 20 20 20 11 23 ...
##  $ college_award              : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ college_note               : num  20 18 18 19 16 17 17 17 17 17 ...
##  $ admission_exam_year        : int  2017 2017 2017 2018 2018 2019 2019 2018 2018 2019 ...
##  $ admission_exam_day         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ math_exam                  : int  590 458 581 707 493 631 557 654 654 758 ...
##  $ writing_exam               : int  587 496 609 669 527 676 571 629 534 556 ...
##  $ verbal_exam                : int  521 508 638 719 513 658 603 492 581 625 ...
##  $ total_exam                 : int  1698 1462 1828 2095 1533 1965 1731 1755 1769 1939 ...
##  $ max_payment_year           : int  2018 2017 2020 2020 2019 2020 2020 2019 2020 2019 ...
##  $ min_payment_year           : int  2020 2018 2020 2020 2019 2020 2020 2019 2020 2020 ...
##  $ num_transaction            : int  3 2 3 5 5 1 5 4 3 4 ...
##  $ payment_value              : int  71357 119615 1915 5867 21672 2925 1224 44702 4792 14600 ...
##  $ scholarship                : int  0 1 0 0 0 0 0 1 0 0 ...
##  $ financing_plan             : int  1 1 0 0 1 0 0 1 0 1 ...
##  $ scholarshiprefund          : int  0 1 0 0 0 0 0 1 0 0 ...
##  $ semester_age               : int  27 23 26 22 20 20 20 20 21 21 ...
##  $ final_note                 : num  3.33 1.11 3.4 3.85 3.3 ...
##  $ semester_number            : int  3 2 3 5 5 1 5 4 3 4 ...
##  $ mandatory_subject          : int  300 256 207 247 44 241 246 148 220 213 ...
##  $ selective_subject          : int  17 22 196 75 166 132 163 56 135 45 ...
##  $ total_credits              : int  91 67 159 107 12 154 90 31 99 193 ...
##  $ number_mandatory_subject   : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ number_elective_subject    : int  999 999 999 999 999 999 999 999 999 999 ...
##  $ number_selective_subject   : int  3 7 4 10 5 2 5 3 5 5 ...
##  $ last_semester              : int  202120 202020 202120 202120 202120 202020 202120 202120 202120 202120 ...
##  $ student_dropout            : int  0 1 0 0 0 1 0 0 0 0 ...
##  $ target                     : Factor w/ 2 levels "0","1": 1 2 1 1 1 2 1 1 1 1 ...
# Per-column summary (quartiles for numeric columns, counts for factors).
summary(object = data)
##   min_periodo      max_periodo      can_periodo        gender     
##  Min.   :202010   Min.   :202010   Min.   :1.000   Min.   :1.000  
##  1st Qu.:202010   1st Qu.:202120   1st Qu.:2.000   1st Qu.:1.000  
##  Median :202020   Median :202120   Median :3.000   Median :2.000  
##  Mean   :202051   Mean   :202113   Mean   :3.046   Mean   :1.507  
##  3rd Qu.:202110   3rd Qu.:202120   3rd Qu.:4.000   3rd Qu.:2.000  
##  Max.   :202120   Max.   :202120   Max.   :5.000   Max.   :3.000  
##    birth_date    birth_month       start_age       ethnicity    
##  Min.   :1968   Min.   : 1.000   Min.   : 0.00   Min.   :1.000  
##  1st Qu.:2001   1st Qu.: 3.250   1st Qu.:18.00   1st Qu.:3.000  
##  Median :2002   Median : 6.000   Median :19.00   Median :4.000  
##  Mean   :2001   Mean   : 6.445   Mean   :18.74   Mean   :4.035  
##  3rd Qu.:2003   3rd Qu.: 9.000   3rd Qu.:19.00   3rd Qu.:6.000  
##  Max.   :2021   Max.   :12.000   Max.   :28.00   Max.   :7.000  
##  marital_status   citizenship    status_academy  start_semester  
##  Min.   :2.000   Min.   :1.000   Min.   : 8.00   Min.   :202010  
##  1st Qu.:6.000   1st Qu.:1.000   1st Qu.:16.00   1st Qu.:202010  
##  Median :6.000   Median :1.000   Median :16.00   Median :202020  
##  Mean   :5.944   Mean   :1.044   Mean   :16.03   Mean   :202051  
##  3rd Qu.:6.000   3rd Qu.:1.000   3rd Qu.:16.00   3rd Qu.:202110  
##  Max.   :8.000   Max.   :4.000   Max.   :32.00   Max.   :202120  
##  last_activity        degree         faculty        num_career    num_subject 
##  Min.   :202010   Min.   : 1.00   Min.   :10.00   Min.   :1.00   Min.   :999  
##  1st Qu.:202120   1st Qu.:13.00   1st Qu.:12.00   1st Qu.:1.00   1st Qu.:999  
##  Median :202120   Median :27.00   Median :16.00   Median :1.00   Median :999  
##  Mean   :202113   Mean   :24.75   Mean   :16.36   Mean   :1.07   Mean   :999  
##  3rd Qu.:202120   3rd Qu.:34.00   3rd Qu.:18.00   3rd Qu.:1.00   3rd Qu.:999  
##  Max.   :202120   Max.   :99.00   Max.   :29.00   Max.   :3.00   Max.   :999  
##  max_note_semester min_note_semester number_payment    last_payment_year
##  Min.   :  0.0     Min.   :  0.0     Min.   :  0.000   Min.   : 999     
##  1st Qu.:  3.0     1st Qu.:  3.0     1st Qu.:  0.000   1st Qu.: 999     
##  Median :  4.0     Median :  4.0     Median :  0.000   Median : 999     
##  Mean   :293.5     Mean   :293.4     Mean   :  2.211   Mean   :1090     
##  3rd Qu.:999.0     3rd Qu.:999.0     3rd Qu.:  0.000   3rd Qu.: 999     
##  Max.   :999.0     Max.   :999.0     Max.   :116.000   Max.   :2020     
##  note_last_semester school_preparation_problem academy_performance_problem
##  Min.   :  0.0      Min.   :2                  Min.   :  0.000            
##  1st Qu.:  0.0      1st Qu.:2                  1st Qu.:  0.000            
##  Median :  4.0      Median :2                  Median :  0.000            
##  Mean   :354.1      Mean   :2                  Mean   :  7.446            
##  3rd Qu.:999.0      3rd Qu.:2                  3rd Qu.:  0.000            
##  Max.   :999.0      Max.   :2                  Max.   :999.000            
##  academy_change academy_orientation admission_semester academic_level 
##  Min.   :0      Min.   :1           Min.   :202010     Min.   :1.000  
##  1st Qu.:0      1st Qu.:1           1st Qu.:202010     1st Qu.:5.000  
##  Median :0      Median :1           Median :202020     Median :5.000  
##  Mean   :0      Mean   :1           Mean   :202051     Mean   :4.839  
##  3rd Qu.:0      3rd Qu.:1           3rd Qu.:202110     3rd Qu.:5.000  
##  Max.   :0      Max.   :1           Max.   :202120     Max.   :5.000  
##  aplication_year aplication_month faculty_admission     school      
##  Min.   :2017    Min.   : 1.000   Min.   :10.00     Min.   :  4.00  
##  1st Qu.:2020    1st Qu.: 2.000   1st Qu.:12.00     1st Qu.: 28.00  
##  Median :2020    Median : 5.000   Median :16.00     Median : 43.00  
##  Mean   :2020    Mean   : 5.785   Mean   :16.36     Mean   : 55.21  
##  3rd Qu.:2020    3rd Qu.:10.000   3rd Qu.:18.00     3rd Qu.: 51.00  
##  Max.   :2021    Max.   :12.000   Max.   :29.00     Max.   :999.00  
##  degree_admission    college     college_city   college_province
##  Min.   :  2.00   Min.   :  3   Min.   :  1.0   Min.   :  1.00  
##  1st Qu.: 15.00   1st Qu.:158   1st Qu.: 55.0   1st Qu.: 20.00  
##  Median : 27.00   Median :288   Median : 55.0   Median : 20.00  
##  Mean   : 68.67   Mean   :362   Mean   :112.2   Mean   : 73.33  
##  3rd Qu.: 46.00   3rd Qu.:459   3rd Qu.: 55.0   3rd Qu.: 20.00  
##  Max.   :999.00   Max.   :999   Max.   :999.0   Max.   :999.00  
##  college_award    college_note   admission_exam_year admission_exam_day
##  Min.   :  2.0   Min.   : 7.20   Min.   :2017        Min.   :1         
##  1st Qu.:999.0   1st Qu.:16.00   1st Qu.:2020        1st Qu.:1         
##  Median :999.0   Median :17.00   Median :2020        Median :1         
##  Mean   :994.2   Mean   :17.46   Mean   :2020        Mean   :1         
##  3rd Qu.:999.0   3rd Qu.:19.00   3rd Qu.:2020        3rd Qu.:1         
##  Max.   :999.0   Max.   :20.00   Max.   :2021        Max.   :1         
##    math_exam       writing_exam     verbal_exam       total_exam  
##  Min.   : 334.0   Min.   : 329.0   Min.   : 323.0   Min.   :1046  
##  1st Qu.: 621.0   1st Qu.: 582.0   1st Qu.: 590.0   1st Qu.:1916  
##  Median : 708.0   Median : 669.0   Median : 670.0   Median :2091  
##  Mean   : 718.2   Mean   : 697.1   Mean   : 696.1   Mean   :2110  
##  3rd Qu.: 809.0   3rd Qu.: 799.0   3rd Qu.: 787.0   3rd Qu.:2307  
##  Max.   :1000.0   Max.   :1000.0   Max.   :1000.0   Max.   :2931  
##  max_payment_year min_payment_year num_transaction payment_value   
##  Min.   :2017     Min.   :2018     Min.   :1.000   Min.   :     1  
##  1st Qu.:2020     1st Qu.:2020     1st Qu.:2.000   1st Qu.:  1667  
##  Median :2020     Median :2020     Median :3.000   Median :  3228  
##  Mean   :2020     Mean   :2020     Mean   :3.046   Mean   :  3742  
##  3rd Qu.:2021     3rd Qu.:2021     3rd Qu.:4.000   3rd Qu.:  4855  
##  Max.   :2021     Max.   :2021     Max.   :5.000   Max.   :119615  
##   scholarship      financing_plan    scholarshiprefund  semester_age  
##  Min.   :0.00000   Min.   :0.00000   Min.   :0.00000   Min.   :13.00  
##  1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:0.00000   1st Qu.:18.00  
##  Median :0.00000   Median :0.00000   Median :0.00000   Median :19.00  
##  Mean   :0.01326   Mean   :0.08444   Mean   :0.00977   Mean   :19.43  
##  3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:0.00000   3rd Qu.:20.00  
##  Max.   :1.00000   Max.   :1.00000   Max.   :1.00000   Max.   :56.00  
##    final_note      semester_number mandatory_subject selective_subject
##  Min.   :  0.000   Min.   :1.000   Min.   :  0.00    Min.   :  0.0    
##  1st Qu.:  3.138   1st Qu.:2.000   1st Qu.: 77.25    1st Qu.: 52.0    
##  Median :  3.750   Median :3.000   Median :150.00    Median :101.0    
##  Mean   :293.455   Mean   :3.053   Mean   :150.53    Mean   :101.4    
##  3rd Qu.:999.000   3rd Qu.:4.000   3rd Qu.:222.75    3rd Qu.:153.0    
##  Max.   :999.000   Max.   :5.000   Max.   :300.00    Max.   :200.0    
##  total_credits   number_mandatory_subject number_elective_subject
##  Min.   :  0.0   Min.   :999              Min.   :999            
##  1st Qu.: 53.0   1st Qu.:999              1st Qu.:999            
##  Median :101.0   Median :999              Median :999            
##  Mean   :100.5   Mean   :999              Mean   :999            
##  3rd Qu.:149.0   3rd Qu.:999              3rd Qu.:999            
##  Max.   :200.0   Max.   :999              Max.   :999            
##  number_selective_subject last_semester    student_dropout   target  
##  Min.   : 0.000           Min.   :202010   Min.   :0.00000   0:2735  
##  1st Qu.: 2.000           1st Qu.:202120   1st Qu.:0.00000   1: 131  
##  Median : 5.000           Median :202120   Median :0.00000           
##  Mean   : 4.997           Mean   :202113   Mean   :0.04571           
##  3rd Qu.: 8.000           3rd Qu.:202120   3rd Qu.:0.00000           
##  Max.   :10.000           Max.   :202120   Max.   :1.00000
# Descriptive statistics for the numeric columns ----
# vapply() always returns a logical mask (sapply() returns a list for a
# zero-column frame), and drop = FALSE keeps a data frame even when only one
# column is numeric, so describe() still labels the row with the column name.
numeric_mask <- vapply(data, is.numeric, logical(1))
psych::describe(data[, numeric_mask, drop = FALSE])
##                             vars    n      mean      sd    median   trimmed
## min_periodo                    1 2866 202051.07   47.71 202020.00 202048.65
## max_periodo                    2 2866 202112.61   24.19 202120.00 202118.89
## can_periodo                    3 2866      3.05    1.35      3.00      3.06
## gender                         4 2866      1.51    0.50      2.00      1.51
## birth_date                     5 2866   2001.37    3.40   2002.00   2002.03
## birth_month                    6 2866      6.45    3.47      6.00      6.45
## start_age                      7 2866     18.74    1.47     19.00     18.69
## ethnicity                      8 2866      4.04    1.79      4.00      4.04
## marital_status                 9 2866      5.94    0.46      6.00      6.00
## citizenship                   10 2866      1.04    0.30      1.00      1.00
## status_academy                11 2866     16.03    2.28     16.00     16.00
## start_semester                12 2866 202051.07   47.71 202020.00 202048.65
## last_activity                 13 2866 202112.91   23.72 202120.00 202118.98
## degree                        14 2866     24.75   14.07     27.00     24.88
## faculty                       15 2866     16.36    5.73     16.00     15.66
## num_career                    16 2866      1.07    0.27      1.00      1.00
## num_subject                   17 2866    999.00    0.00    999.00    999.00
## max_note_semester             18 2866    293.54  452.80      4.00    242.19
## min_note_semester             19 2866    293.45  452.86      4.00    242.07
## number_payment                20 2866      2.21    8.96      0.00      0.00
## last_payment_year             21 2866   1090.18  291.18    999.00    999.00
## note_last_semester            22 2866    354.14  476.88      4.00    317.89
## school_preparation_problem    23 2866      2.00    0.00      2.00      2.00
## academy_performance_problem   24 2866      7.45   85.20      0.00      0.04
## academy_change                25 2866      0.00    0.00      0.00      0.00
## academy_orientation           26 2866      1.00    0.00      1.00      1.00
## admission_semester            27 2866 202051.07   47.71 202020.00 202048.65
## academic_level                28 2866      4.84    0.68      5.00      5.00
## aplication_year               29 2866   2020.06    0.68   2020.00   2020.09
## aplication_month              30 2866      5.79    3.78      5.00      5.62
## faculty_admission             31 2866     16.36    5.73     16.00     15.66
## school                        32 2866     55.21  109.32     43.00     41.13
## degree_admission              33 2866     68.67  195.29     27.00     29.11
## college                       34 2866    362.05  286.37    288.00    322.74
## college_city                  35 2866    112.24  236.84     55.00     52.84
## college_province              36 2866     73.33  225.91     20.00     19.38
## college_award                 37 2866    994.16   69.06    999.00    999.00
## college_note                  38 2866     17.46    1.57     17.00     17.46
## admission_exam_year           39 2866   2020.06    0.68   2020.00   2020.09
## admission_exam_day            40 2866      1.00    0.00      1.00      1.00
## math_exam                     41 2866    718.18  133.55    708.00    714.73
## writing_exam                  42 2866    697.08  140.96    669.00    688.33
## verbal_exam                   43 2866    696.13  137.87    670.00    687.55
## total_exam                    44 2866   2109.98  292.35   2091.00   2106.48
## max_payment_year              45 2866   2020.36    0.55   2020.00   2020.36
## min_payment_year              46 2866   2020.37    0.53   2020.00   2020.36
## num_transaction               47 2866      3.05    1.35      3.00      3.06
## payment_value                 48 2866   3741.75 4250.82   3227.50   3257.55
## scholarship                   49 2866      0.01    0.11      0.00      0.00
## financing_plan                50 2866      0.08    0.28      0.00      0.00
## scholarshiprefund             51 2866      0.01    0.10      0.00      0.00
## semester_age                  52 2866     19.43    2.16     19.00     19.04
## final_note                    53 2866    293.45  452.85      3.75    242.08
## semester_number               54 2866      3.05    1.35      3.00      3.06
## mandatory_subject             55 2866    150.53   85.39    150.00    150.58
## selective_subject             56 2866    101.39   57.25    101.00    101.53
## total_credits                 57 2866    100.51   57.42    101.00    100.64
## number_mandatory_subject      58 2866    999.00    0.00    999.00    999.00
## number_elective_subject       59 2866    999.00    0.00    999.00    999.00
## number_selective_subject      60 2866      5.00    2.97      5.00      4.99
## last_semester                 61 2866 202112.91   23.72 202120.00 202118.98
## student_dropout               62 2866      0.05    0.21      0.00      0.00
##                                 mad      min    max    range   skew kurtosis
## min_periodo                   14.83 202010.0 202120    110.0   0.43    -1.79
## max_periodo                    0.00 202010.0 202120    110.0  -3.63    11.51
## can_periodo                    1.48      1.0      5      4.0   0.18    -1.38
## gender                         0.00      1.0      3      2.0  -0.02    -1.98
## birth_date                     1.48   1968.0   2021     53.0  -3.48    22.45
## birth_month                    4.45      1.0     12     11.0   0.02    -1.20
## start_age                      1.48      0.0     28     28.0  -2.78    47.90
## ethnicity                      2.97      1.0      7      6.0  -0.03    -1.11
## marital_status                 0.00      2.0      8      6.0  -7.77    62.39
## citizenship                    0.00      1.0      4      3.0   6.57    41.57
## status_academy                 0.00      8.0     32     24.0   2.67    26.28
## start_semester                14.83 202010.0 202120    110.0   0.43    -1.79
## last_activity                  0.00 202010.0 202120    110.0  -3.72    12.19
## degree                        13.34      1.0     99     98.0   0.17     0.61
## faculty                        4.45     10.0     29     19.0   0.92    -0.09
## num_career                     0.00      1.0      3      2.0   3.79    14.23
## num_subject                    0.00    999.0    999      0.0    NaN      NaN
## max_note_semester              1.48      0.0    999    999.0   0.92    -1.16
## min_note_semester              1.48      0.0    999    999.0   0.92    -1.16
## number_payment                 0.00      0.0    116    116.0   5.73    41.12
## last_payment_year              0.00    999.0   2020   1021.0   2.88     6.29
## note_last_semester             5.93      0.0    999    999.0   0.61    -1.63
## school_preparation_problem     0.00      2.0      2      0.0    NaN      NaN
## academy_performance_problem    0.00      0.0    999    999.0  11.55   131.39
## academy_change                 0.00      0.0      0      0.0    NaN      NaN
## academy_orientation            0.00      1.0      1      0.0    NaN      NaN
## admission_semester            14.83 202010.0 202120    110.0   0.43    -1.79
## academic_level                 0.00      1.0      5      4.0  -4.01    14.19
## aplication_year                0.00   2017.0   2021      4.0  -0.35     0.23
## aplication_month               4.45      1.0     12     11.0   0.28    -1.34
## faculty_admission              4.45     10.0     29     19.0   0.92    -0.09
## school                        19.27      4.0    999    995.0   8.08    66.79
## degree_admission              23.72      2.0    999    997.0   4.51    18.57
## college                      225.36      3.0    999    996.0   1.20     0.55
## college_city                   0.00      1.0    999    998.0   3.46    10.04
## college_province               0.00      1.0    999    998.0   3.85    12.84
## college_award                  0.00      2.0    999    997.0 -14.20   199.61
## college_note                   1.48      7.2     20     12.8  -0.38     1.19
## admission_exam_year            0.00   2017.0   2021      4.0  -0.35     0.23
## admission_exam_day             0.00      1.0      1      0.0    NaN      NaN
## math_exam                    138.62    334.0   1000    666.0   0.19    -0.69
## writing_exam                 145.29    329.0   1000    671.0   0.49    -0.74
## verbal_exam                  139.36    323.0   1000    677.0   0.48    -0.67
## total_exam                   289.11   1046.0   2931   1885.0   0.10    -0.23
## max_payment_year               0.00   2017.0   2021      4.0  -0.13    -0.25
## min_payment_year               0.00   2018.0   2021      3.0   0.06    -0.92
## num_transaction                1.48      1.0      5      4.0   0.18    -1.38
## payment_value               2372.90      1.0 119615 119614.0  10.47   222.91
## scholarship                    0.00      0.0      1      1.0   8.51    70.38
## financing_plan                 0.00      0.0      1      1.0   2.99     6.93
## scholarshiprefund              0.00      0.0      1      1.0   9.96    97.30
## semester_age                   1.48     13.0     56     43.0   5.24    54.53
## final_note                     1.25      0.0    999    999.0   0.92    -1.16
## semester_number                1.48      1.0      5      4.0   0.17    -1.39
## mandatory_subject            108.23      0.0    300    300.0   0.00    -1.17
## selective_subject             74.13      0.0    200    200.0   0.00    -1.20
## total_credits                 71.16      0.0    200    200.0  -0.01    -1.17
## number_mandatory_subject       0.00    999.0    999      0.0    NaN      NaN
## number_elective_subject        0.00    999.0    999      0.0    NaN      NaN
## number_selective_subject       4.45      0.0     10     10.0   0.02    -1.21
## last_semester                  0.00 202010.0 202120    110.0  -3.72    12.19
## student_dropout                0.00      0.0      1      1.0   4.35    16.91
##                                se
## min_periodo                  0.89
## max_periodo                  0.45
## can_periodo                  0.03
## gender                       0.01
## birth_date                   0.06
## birth_month                  0.06
## start_age                    0.03
## ethnicity                    0.03
## marital_status               0.01
## citizenship                  0.01
## status_academy               0.04
## start_semester               0.89
## last_activity                0.44
## degree                       0.26
## faculty                      0.11
## num_career                   0.00
## num_subject                  0.00
## max_note_semester            8.46
## min_note_semester            8.46
## number_payment               0.17
## last_payment_year            5.44
## note_last_semester           8.91
## school_preparation_problem   0.00
## academy_performance_problem  1.59
## academy_change               0.00
## academy_orientation          0.00
## admission_semester           0.89
## academic_level               0.01
## aplication_year              0.01
## aplication_month             0.07
## faculty_admission            0.11
## school                       2.04
## degree_admission             3.65
## college                      5.35
## college_city                 4.42
## college_province             4.22
## college_award                1.29
## college_note                 0.03
## admission_exam_year          0.01
## admission_exam_day           0.00
## math_exam                    2.49
## writing_exam                 2.63
## verbal_exam                  2.58
## total_exam                   5.46
## max_payment_year             0.01
## min_payment_year             0.01
## num_transaction              0.03
## payment_value               79.40
## scholarship                  0.00
## financing_plan               0.01
## scholarshiprefund            0.00
## semester_age                 0.04
## final_note                   8.46
## semester_number              0.03
## mandatory_subject            1.60
## selective_subject            1.07
## total_credits                1.07
## number_mandatory_subject     0.00
## number_elective_subject      0.00
## number_selective_subject     0.06
## last_semester                0.44
## student_dropout              0.00
# Correlation ----
# Logical mask of the numeric columns; vapply() keeps the result a logical
# vector whatever the shape of `data`.
num_cols <- vapply(data, is.numeric, logical(1))

# Identify numeric columns with zero standard deviation (constants): their
# correlation with anything is undefined, so cor() would return NA for them.
# isTRUE() treats an NA standard deviation (e.g. an all-NA column) as
# "not zero", so it cannot inject NA into the index below.
is_zero_sd <- vapply(
  data[, num_cols, drop = FALSE],
  function(x) isTRUE(sd(x, na.rm = TRUE) == 0),
  logical(1)
)
sd_zero_cols <- names(data)[num_cols][is_zero_sd]

# Drop the zero-SD variables; drop = FALSE keeps `data` a data frame even if
# a single column is left.
data <- data[, !(names(data) %in% sd_zero_cols), drop = FALSE]

# Correlation matrix of the remaining numeric columns, drawn as a colour map
# of the upper triangle.
numeric_after <- vapply(data, is.numeric, logical(1))
corr_matrix <- cor(data[, numeric_after, drop = FALSE], use = "complete.obs")
corrplot(corr_matrix, method = "color", type = "upper", tl.cex = 0.6)

# Selected predictors ----
selected_vars <- c(
  "final_note", "semester_number", "total_credits",
  "math_exam", "verbal_exam", "payment_value"
)

# Train/test split ----
# Fixed seed so the split is reproducible. `split` is used as a row mask:
# TRUE rows go to `train`, the rest to `test`.
set.seed(123)
split <- caTools::sample.split(data[["target"]], SplitRatio = 0.7)
train <- data[split, , drop = FALSE]
test <- data[!split, , drop = FALSE]


# Logistic regression ----
# Build `target ~ <selected_vars>` and fit a binomial GLM on the training rows.
formula_log <- reformulate(selected_vars, response = "target")
log_model <- glm(formula = formula_log, family = "binomial", data = train)
summary(log_model)
## 
## Call:
## glm(formula = formula_log, family = "binomial", data = train)
## 
## Coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      2.635e+00  9.641e-01   2.733  0.00627 ** 
## final_note      -8.430e-03  4.518e-03  -1.866  0.06204 .  
## semester_number -2.175e+00  2.075e-01 -10.485  < 2e-16 ***
## total_credits   -1.406e-03  2.374e-03  -0.592  0.55366    
## math_exam        8.743e-04  1.002e-03   0.872  0.38307    
## verbal_exam     -1.144e-03  1.027e-03  -1.114  0.26516    
## payment_value    4.967e-05  2.407e-05   2.063  0.03908 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 746.82  on 2005  degrees of freedom
## Residual deviance: 381.63  on 1999  degrees of freedom
## AIC: 395.63
## 
## Number of Fisher Scoring iterations: 11
# Variance inflation factors (multicollinearity check) ----
car::vif(log_model)
##      final_note semester_number   total_credits       math_exam     verbal_exam 
##        1.000457        1.065900        1.027318        1.092876        1.078042 
##   payment_value 
##        1.041177
# Influence plot ----
# car::influencePlot() takes point-labelling options through `id`. The former
# `id.method` argument is not recognised: it was forwarded to the low-level
# plotting functions, each of which warned that it "is not a graphical
# parameter". `id = TRUE` requests the default automatic labelling and needs
# no interactive graphics device, so it also works when the document is
# knitted.
influencePlot(
  log_model,
  id = TRUE,
  main = "Influence Plot",
  sub = "Circle size is proportional to Cook's distance"
)

##         StudRes         Hat        CookD
## 1    -1.2952899 0.585845370 0.2725643505
## 2     0.2041173 0.141990019 0.0005348591
## 1442  2.9907334 0.001940802 0.0222671779
## 1498  2.8970107 0.003475307 0.0294724274
# Normal Q-Q plot of the deviance residuals ----
# The red reference line makes departures from normality easy to spot.
resid_log <- resid(log_model, type = "deviance")
qqnorm(y = resid_log)
qqline(y = resid_log, col = "red")

# Hosmer-Lemeshow goodness-of-fit test on the training data: the observed
# outcome is the factor's integer codes minus 1 (codes 1/2 become 0/1) and
# the expected values are the model's fitted values.
# NOTE(review): the test output's "data:" line echoes these argument
# expressions verbatim, so rewording them changes the printed report.
hoslem.test(as.numeric(train$target) - 1, fitted(log_model))
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  as.numeric(train$target) - 1, fitted(log_model)
## X-squared = 1.0069, df = 8, p-value = 0.9982
# Prediction and confusion matrix (logistic regression)
# Predicted probabilities on the test set, thresholded at 0.5.
log_prob <- predict(log_model, newdata = test, type = "response")
# Build the predicted factor with the same level set as the reference.
# as.factor() on the thresholded values would produce a single-level factor
# whenever all probabilities fall on one side of 0.5, and confusionMatrix()
# would then reject the mismatched levels.
# Assumes the levels of test$target are "0" and "1" -- TODO confirm.
log_class <- factor(
  ifelse(log_prob > 0.5, "1", "0"),
  levels = levels(test$target)
)
# NOTE(review): caret treats the first factor level as the positive class, so
# Sensitivity/PPV below describe class "0". Pass positive = "1" if class "1"
# is the event of interest.
confusionMatrix(log_class, test$target)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 798  16
##          1  23  23
##                                           
##                Accuracy : 0.9547          
##                  95% CI : (0.9385, 0.9676)
##     No Information Rate : 0.9547          
##     P-Value [Acc > NIR] : 0.5425          
##                                           
##                   Kappa : 0.5175          
##                                           
##  Mcnemar's Test P-Value : 0.3367          
##                                           
##             Sensitivity : 0.9720          
##             Specificity : 0.5897          
##          Pos Pred Value : 0.9803          
##          Neg Pred Value : 0.5000          
##              Prevalence : 0.9547          
##          Detection Rate : 0.9279          
##    Detection Prevalence : 0.9465          
##       Balanced Accuracy : 0.7809          
##                                           
##        'Positive' Class : 0               
## 
# Linear discriminant analysis using the predictors named in selected_vars
lda_formula <- reformulate(termlabels = selected_vars, response = "target")
lda_model <- lda(lda_formula, data = train)
# Classify the test set and compare the predicted classes with the truth
lda_pred <- predict(lda_model, newdata = test)
confusionMatrix(data = lda_pred$class, reference = test$target)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 792  15
##          1  29  24
##                                           
##                Accuracy : 0.9488          
##                  95% CI : (0.9319, 0.9626)
##     No Information Rate : 0.9547          
##     P-Value [Acc > NIR] : 0.81769         
##                                           
##                   Kappa : 0.4954          
##                                           
##  Mcnemar's Test P-Value : 0.05002         
##                                           
##             Sensitivity : 0.9647          
##             Specificity : 0.6154          
##          Pos Pred Value : 0.9814          
##          Neg Pred Value : 0.4528          
##              Prevalence : 0.9547          
##          Detection Rate : 0.9209          
##    Detection Prevalence : 0.9384          
##       Balanced Accuracy : 0.7900          
##                                           
##        'Positive' Class : 0               
## 
# Univariate normality test (Shapiro-Wilk) for every predictor within each class
cat("Uji normalitas univariat (Shapiro-Wilk) per kelas:\n")
## Uji normalitas univariat (Shapiro-Wilk) per kelas:
for (v in selected_vars) {
  # `[[` always yields a plain vector, also when `train` is a tibble
  values <- train[[v]]
  for (g in levels(train$target)) {
    # which() drops rows whose target is NA instead of indexing with NA
    x <- values[which(train$target == g)]
    x <- x[!is.na(x)]
    n_obs <- length(x)
    # shapiro.test() requires 3 to 5000 non-missing, non-constant values
    if (n_obs >= 3 && n_obs <= 5000 && length(unique(x)) > 1) {
      sw <- shapiro.test(x)
      cat(sprintf("Variabel: %s, Group: %s → p-value: %.4f\n", v, g, sw$p.value))
    } else {
      # Report the skip explicitly so a missing line is not mistaken for a pass
      cat(sprintf("Variabel: %s, Group: %s → dilewati (n = %d)\n", v, g, n_obs))
    }
  }
}
## Variabel: final_note, Group: 0 → p-value: 0.0000
## Variabel: final_note, Group: 1 → p-value: 0.0000
## Variabel: semester_number, Group: 0 → p-value: 0.0000
## Variabel: semester_number, Group: 1 → p-value: 0.0000
## Variabel: total_credits, Group: 0 → p-value: 0.0000
## Variabel: total_credits, Group: 1 → p-value: 0.0559
## Variabel: math_exam, Group: 0 → p-value: 0.0000
## Variabel: math_exam, Group: 1 → p-value: 0.2022
## Variabel: verbal_exam, Group: 0 → p-value: 0.0000
## Variabel: verbal_exam, Group: 1 → p-value: 0.0037
## Variabel: payment_value, Group: 0 → p-value: 0.0000
## Variabel: payment_value, Group: 1 → p-value: 0.0000
# Multivariate normality test (Mardia)
# requireNamespace() only checks whether MVN is installed; unlike require() it
# does not attach the package as a side effect of the check, so MVN is attached
# exactly once by library() below (which errors loudly if loading fails).
if (!requireNamespace("MVN", quietly = TRUE)) {
  install.packages("MVN")
}
library(MVN)
cat("\nUji Mardia untuk multivariat normalitas:\n")
## 
## Uji Mardia untuk multivariat normalitas:
# Run Mardia's test on the selected predictors and print its result table
mvn_result <- mvn(data = train[, selected_vars], mvnTest = "mardia")
print(mvn_result$multivariateNormality)
##              Test        Statistic p value Result
## 1 Mardia Skewness 49251.8740969285       0     NO
## 2 Mardia Kurtosis 574.496723683397       0     NO
## 3             MVN             <NA>    <NA>     NO
# Box's M test for equality of covariance matrices between the target groups
cat("\nUji Box's M untuk kesamaan matriks kovarians:\n")
## 
## Uji Box's M untuk kesamaan matriks kovarians:
# boxM() receives the predictor columns and the grouping factor; the result is
# kept in `boxm` and printed explicitly.
boxm <- boxM(train[, selected_vars], train$target)
print(boxm)
## 
##  Box's M-test for Homogeneity of Covariance Matrices
## 
## data:  train[, selected_vars]
## Chi-Sq (approx.) = 1537.4, df = 21, p-value < 2.2e-16
# Pairwise correlations between the predictor variables, shown to 2 decimals
cat("\nKorelasi antar variabel prediktor:\n")
## 
## Korelasi antar variabel prediktor:
corr_matrix_lda <- cor(x = train[, selected_vars])
print(round(corr_matrix_lda, digits = 2))
##                 final_note semester_number total_credits math_exam verbal_exam
## final_note            1.00           -0.60         -0.03      0.13        0.13
## semester_number      -0.60            1.00         -0.01     -0.10       -0.14
## total_credits        -0.03           -0.01          1.00     -0.04       -0.01
## math_exam             0.13           -0.10         -0.04      1.00        0.24
## verbal_exam           0.13           -0.14         -0.01      0.24        1.00
## payment_value        -0.11            0.10         -0.04     -0.09       -0.09
##                 payment_value
## final_note              -0.11
## semester_number          0.10
## total_credits           -0.04
## math_exam               -0.09
## verbal_exam             -0.09
## payment_value            1.00
# Upper-triangle correlation plot with numeric labels
corrplot(
  corr_matrix_lda,
  method = "number",
  type = "upper",
  tl.cex = 0.7
)

# Visualisation: correlations recomputed from complete observations only
corrplot(
  cor(train[, selected_vars], use = "complete.obs"),
  method = "number",
  type = "upper",
  tl.cex = 0.7
)

# Box plot of final_note for each value of target, coloured by target
ggplot(data) +
  geom_boxplot(aes(x = target, y = final_note, fill = target)) +
  ggtitle("Boxplot Final Note berdasarkan Status Dropout") +
  xlab("Status Dropout") +
  ylab("Final Note") +
  theme_minimal()

# ROC curve and AUC of the logistic regression on the test set.
# Levels (control = 0, case = 1) and direction (controls < cases) are stated
# explicitly instead of being auto-detected: this matches what roc() chose
# before, silences the "Setting levels/direction" messages, and prevents the
# direction from being flipped automatically on other data.
roc_obj <- roc(
  response = as.numeric(test$target) - 1,
  predictor = log_prob,
  levels = c(0, 1),
  direction = "<"
)
plot(roc_obj, col = "blue", lwd = 2, main = "ROC Curve - Regresi Logistik")
# The ROC plot's x axis is specificity running from 1 down to 0, so the chance
# line is sensitivity = 1 - specificity; abline(a = 0, b = 1) would draw the
# opposite diagonal.
abline(a = 1, b = -1, lty = 2, col = "gray")

cat("AUC:", auc(roc_obj), "\n")
## AUC: 0.9369437
# Heat map of the logistic-regression confusion matrix with the cell counts
# printed on each tile
cm <- confusionMatrix(data = log_class, reference = test$target)
cm_df <- as.data.frame(cm[["table"]])
ggplot(cm_df, aes(x = Prediction, y = Reference, fill = Freq)) +
  geom_tile(colour = "white") +
  geom_text(aes(label = Freq), size = 5) +
  scale_fill_gradient(low = "white", high = "steelblue") +
  labs(
    title = "Confusion Matrix - Regresi Logistik",
    x = "Prediksi",
    y = "Kenyataan"
  ) +
  theme_minimal()

Including Plots

You can also embed plots, for example:

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.