Prediction Kelulusan Mahasiswa
Import Data
Exploratory Data Analysis
Check Data
## 'data.frame': 19711 obs. of 54 variables:
## $ X.1 : int 1 2 3 4 5 6 7 8 9 10 ...
## $ X : int 1 2 3 4 5 6 7 8 9 10 ...
## $ NIM : num 1.41e+10 1.41e+10 1.41e+10 1.41e+10 1.41e+10 ...
## $ Fakultas : int 4 4 4 4 4 4 4 4 4 4 ...
## $ Prodi : int 12 12 12 12 12 12 12 12 12 12 ...
## $ Status : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Jalur.Masuk : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Jenis.Mahasiswa : int 3 3 3 3 3 3 3 3 3 3 ...
## $ Kelompok : int 2 1 2 2 2 2 2 2 2 2 ...
## $ Kerjasama : int 2 2 2 2 2 2 2 2 2 2 ...
## $ Jenis.Kelamin : int 2 2 2 2 1 1 2 2 2 2 ...
## $ Provinsi.Asal : int 5 9 10 5 5 3 11 11 10 11 ...
## $ Pendidikan.Ayah : int 6 6 2 6 6 6 9 7 6 8 ...
## $ Pendidikan.Ibu : int 6 6 3 6 6 6 8 7 6 8 ...
## $ Pekerjaan.Ayah : int 10 10 9 2 9 2 13 6 12 1 ...
## $ Pekerjaan.Ibu : int 14 14 9 14 14 2 12 10 14 1 ...
## $ Penghasilan.Ayah : int 6 3 10 5 9 4 1 4 2 3 ...
## $ Penghasilan.Ibu : int 1 1 10 1 1 4 3 2 1 2 ...
## $ Nilai.UAN : num 7.5 7.7 7.9 6.95 6.9 7.3 7.65 6 7.6 8.49 ...
## $ Provinsi.SMTA : int 5 9 10 5 5 5 11 11 10 11 ...
## $ Kebutuhan.Khusus.Mhs: int 13 13 13 13 13 13 13 13 13 13 ...
## $ Angkatan : int 2014 2014 2014 2014 2014 2014 2014 2014 2014 2014 ...
## $ IPK.2 : num 3.41 3.07 2.87 3.1 2.98 2.74 3.52 3.29 3.5 3.64 ...
## $ SKS.2 : int 23 23 23 23 23 23 23 23 23 23 ...
## $ Umur : int 18 18 18 18 19 17 18 18 18 18 ...
## $ IPK.1 : num 3.47 3.12 3.08 3.3 3.19 2.93 3.57 3.37 3.61 3.62 ...
## $ SKS.1 : int 47 47 47 47 47 47 47 47 47 47 ...
## $ IPK.3 : num 3.46 3.24 3.24 3.4 3.26 3.03 3.55 3.44 3.64 3.67 ...
## $ SKS.3 : int 69 69 69 69 69 69 69 69 69 69 ...
## $ IPK.4 : num 3.5 3.25 3.28 3.45 3.33 3.06 3.61 3.49 3.63 3.7 ...
## $ SKS.4 : int 93 93 93 93 93 93 93 93 93 93 ...
## $ IPK.5 : num 3.52 3.26 3.31 3.46 3.41 3.17 3.65 3.5 3.62 3.71 ...
## $ SKS.5 : int 114 114 114 114 114 114 114 114 114 114 ...
## $ IPK.6 : num 3.48 3.23 3.33 3.47 3.44 3.22 3.63 3.48 3.58 3.7 ...
## $ SKS.6 : int 132 135 135 135 135 135 132 132 132 132 ...
## $ IPK.7 : num 3.5 3.24 3.35 3.48 3.47 3.27 3.64 3.53 3.59 3.72 ...
## $ SKS.7 : int 144 144 144 144 144 144 144 144 144 144 ...
## $ IPK.8 : num 3.5 3.27 3.37 0 3.49 3.31 3.63 0 0 0 ...
## $ SKS.8 : int 144 141 141 0 144 144 141 0 0 0 ...
## $ IPK.9 : num 0 0 0 0 0 0 3.64 0 0 0 ...
## $ SKS.9 : int 0 0 0 0 0 0 144 0 0 0 ...
## $ IPK.10 : num 0 0 0 0 0 3.13 0 0 0 0 ...
## $ SKS.10 : int 0 0 0 0 0 144 0 0 0 0 ...
## $ IPK.11 : num 0 3.26 0 0 0 0 0 0 0 0 ...
## $ SKS.11 : int 0 144 0 0 0 0 0 0 0 0 ...
## $ IPK.12 : num 0 3.29 0 0 0 0 0 0 0 0 ...
## $ SKS.12 : int 0 144 0 0 0 0 0 0 0 0 ...
## $ IPK.13 : num 0 0 0 0 0 3.27 0 0 0 0 ...
## $ SKS.13 : int 0 0 0 0 0 144 0 0 0 0 ...
## $ IPK.14 : num 0 0 0 0 0 3.27 0 0 0 0 ...
## $ SKS.14 : int 0 0 0 0 0 144 0 0 0 0 ...
## $ keterangan : int 1 1 1 1 1 1 0 1 1 1 ...
## $ semester_kelulusan : int 8 12 8 7 8 14 9 7 7 7 ...
## $ bulan_kelulusan : int 48 72 48 42 48 84 54 42 42 42 ...
Select Data
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
DATAKELULUSAN <- datkelulusan %>% select(-Status,-Angkatan,-NIM,-Fakultas,-Umur,-Jalur.Masuk,-Jenis.Mahasiswa,-Kerjasama,-Jenis.Kelamin,-Pendidikan.Ayah,-Pendidikan.Ibu,-Pekerjaan.Ayah,-Pekerjaan.Ibu,-Kebutuhan.Khusus.Mhs,Angkatan, -X,-X.1, -Nilai.UAN,-Provinsi.Asal, -Provinsi.SMTA,-keterangan ,-bulan_kelulusan, -Angkatan,-SKS.1,-SKS.2,-SKS.3,-SKS.4,-SKS.5,-SKS.6,-SKS.7,-SKS.8,-SKS.9,-SKS.10,-SKS.11,-SKS.12,-SKS.13,-SKS.14)
head(DATAKELULUSAN)## Prodi Kelompok Penghasilan.Ayah Penghasilan.Ibu IPK.2 IPK.1 IPK.3 IPK.4 IPK.5
## 1 12 2 6 1 3.41 3.47 3.46 3.50 3.52
## 2 12 1 3 1 3.07 3.12 3.24 3.25 3.26
## 3 12 2 10 10 2.87 3.08 3.24 3.28 3.31
## 4 12 2 5 1 3.10 3.30 3.40 3.45 3.46
## 5 12 2 9 1 2.98 3.19 3.26 3.33 3.41
## 6 12 2 4 4 2.74 2.93 3.03 3.06 3.17
## IPK.6 IPK.7 IPK.8 IPK.9 IPK.10 IPK.11 IPK.12 IPK.13 IPK.14 semester_kelulusan
## 1 3.48 3.50 3.50 0 0.00 0.00 0.00 0.00 0.00 8
## 2 3.23 3.24 3.27 0 0.00 3.26 3.29 0.00 0.00 12
## 3 3.33 3.35 3.37 0 0.00 0.00 0.00 0.00 0.00 8
## 4 3.47 3.48 0.00 0 0.00 0.00 0.00 0.00 0.00 7
## 5 3.44 3.47 3.49 0 0.00 0.00 0.00 0.00 0.00 8
## 6 3.22 3.27 3.31 0 3.13 0.00 0.00 3.27 3.27 14
Average IPK
Statistic Descriptive
## Prodi Beasiswa Penghasilan.Ayah Penghasilan.Ibu
## Min. : 1.00 Min. :1.000 Min. : 1.000 Min. : 1.000
## 1st Qu.:14.00 1st Qu.:2.000 1st Qu.: 3.000 1st Qu.: 1.000
## Median :27.00 Median :2.000 Median : 4.000 Median : 1.000
## Mean :27.35 Mean :1.831 Mean : 4.913 Mean : 2.929
## 3rd Qu.:39.00 3rd Qu.:2.000 3rd Qu.: 7.000 3rd Qu.: 4.000
## Max. :61.00 Max. :2.000 Max. :10.000 Max. :10.000
## NA's :3 NA's :13
## semester_kelulusan avg_ipk
## Min. : 2.000 Min. :1.940
## 1st Qu.: 7.000 1st Qu.:3.380
## Median : 8.000 Median :3.530
## Mean : 8.055 Mean :3.504
## 3rd Qu.: 9.000 3rd Qu.:3.650
## Max. :14.000 Max. :3.980
##
Check Missing Data
## Prodi Beasiswa Penghasilan.Ayah Penghasilan.Ibu
## 0 0 3 13
## semester_kelulusan avg_ipk
## 0 0
Distribution Target Variable
hist(DATAKELULUSAN$semester_kelulusan, main="Distribusi Semester Kelulusan",xlab="Semester Kelulusan")Check Outlier
par(mfrow = c(2,2))
vars <- c("Penghasilan.Ayah","Beasiswa","Penghasilan.Ibu","avg_ipk","semester_kelulusan")
for(i in vars) {
boxplot(DATAKELULUSAN[[i]],
main = i,
col = "lightgray",
ylab = "Nilai")
}#cek non visual (iqr)
cek_outlier_iqr <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR_val <- Q3 - Q1
sum(
x < (Q1 - 1.5 * IQR_val) |
x > (Q3 + 1.5 * IQR_val),
na.rm = TRUE
)
}
vars_outlier <- c("Penghasilan.Ayah","Penghasilan.Ibu","avg_ipk","semester_kelulusan")
sapply(DATAKELULUSAN[, vars_outlier], cek_outlier_iqr)## Penghasilan.Ayah Penghasilan.Ibu avg_ipk semester_kelulusan
## 0 1888 350 676
Correlation Numeric Variable
## corrplot 0.95 loaded
num_cont <- DATAKELULUSAN[, c("avg_ipk","Beasiswa","semester_kelulusan","Penghasilan.Ayah","Penghasilan.Ibu")]
cor_mat <- cor(num_cont, use = "complete.obs", method = "pearson")
corrplot(cor_mat,
method = "color",
type = "upper",
tl.col = "red",
tl.srt = 45,
addCoef.col = "black",
number.cex = 1.5)Pre-Processing
Handling Missing Values
median_ayah <- median(DATAKELULUSAN$Penghasilan.Ayah, na.rm = TRUE)
DATAKELULUSAN$Penghasilan.Ayah[is.na(DATAKELULUSAN$Penghasilan.Ayah)] <- median_ayah
median_ibu <- median(DATAKELULUSAN$Penghasilan.Ibu, na.rm = TRUE)
DATAKELULUSAN$Penghasilan.Ibu[is.na(DATAKELULUSAN$Penghasilan.Ibu)] <- median_ibu
sapply(DATAKELULUSAN, function(x) sum(is.na(x)))## Prodi Beasiswa Penghasilan.Ayah Penghasilan.Ibu
## 0 0 0 0
## semester_kelulusan avg_ipk
## 0 0
Handling Outlier
winsor_iqr <- function(x) {
Q1 <- quantile(x, 0.25, na.rm = TRUE)
Q3 <- quantile(x, 0.75, na.rm = TRUE)
IQR_val <- Q3 - Q1
lower <- Q1 - 1.5 * IQR_val
upper <- Q3 + 1.5 * IQR_val
x[x < lower] <- lower
x[x > upper] <- upper
return(x)
}
DATAKELULUSAN[vars_outlier] <-lapply(DATAKELULUSAN[vars_outlier], winsor_iqr)
sapply(DATAKELULUSAN[, vars_outlier], cek_outlier_iqr)## Penghasilan.Ayah Penghasilan.Ibu avg_ipk semester_kelulusan
## 0 0 0 0
Splitting Data
## Loading required package: ggplot2
## Loading required package: lattice
Check Multikolineritas
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## Prodi Beasiswa Penghasilan.Ayah Penghasilan.Ibu
## 1.011116 1.133084 1.093058 1.040131
## avg_ipk
## 1.026018
Modelling
Model Gradient Boosting
## Loaded gbm 2.2.3
## This version of gbm is no longer under development. Consider transitioning to gbm3, https://github.com/gbm-developers/gbm3
gb_model <- gbm(
semester_kelulusan ~ .,
data = data_train,
distribution = "gaussian",
n.trees = 1000,
interaction.depth = 3,
shrinkage = 0.05,
n.minobsinnode = 10,
cv.folds = 3)
best_iter <- gbm.perf(gb_model, method = "cv", plot.it = FALSE)
gb_pred <- predict(gb_model,newdata = data_test,n.trees = best_iter)Model Random Forest
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
## The following object is masked from 'package:dplyr':
##
## combine
Evaluation
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
rf_mae <- mae(data_test$semester_kelulusan, rf_pred)
rf_rmse <- rmse(data_test$semester_kelulusan, rf_pred)
rf_mape <- mape(data_test$semester_kelulusan, rf_pred)
gb_mae <- mae(data_test$semester_kelulusan, gb_pred)
gb_rmse <- rmse(data_test$semester_kelulusan, gb_pred)
gb_mape <- mape(data_test$semester_kelulusan, gb_pred)
hasil_evaluasi <- data.frame(
Model = c("Random Forest", "Gradient Boosting"),
MAE = c(rf_mae, gb_mae),
RMSE = c(rf_rmse, gb_rmse),
MAPE = c(rf_mape, gb_mape))
hasil_evaluasi## Model MAE RMSE MAPE
## 1 Random Forest 1.050235 1.405283 0.1321298
## 2 Gradient Boosting 1.024245 1.381200 0.1288727