#LOAD LIBRARY
library(e1071)
library(class)
## Warning: package 'class' was built under R version 4.4.3
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: ggplot2
## Loading required package: lattice
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
## The following object is masked from 'package:ggplot2':
##
## margin
library(rpart)
# Load data ----
# The testing workbook is read with read_excel()'s default sheet selection;
# the training workbook is read from its "datatraining" sheet.
test.data <- read_excel("C:/R/datatesting.xlsx")
train.data <- read_excel(
  "C:/R/datatraining.xlsx",
  sheet = "datatraining"
)
# Keep an unmodified copy of the test set; the prediction columns are attached
# to this copy further down in the script.
test.data.ori <- test.data
# Preprocessing (training set) ----
# Cast usia to integer and convert every coded categorical column, including
# the target motivasi_belajar, to a factor; then print the resulting structure.
train.data <- mutate(
  train.data,
  usia = as.integer(usia),
  across(
    all_of(c(
      "jenis_kelamin", "dukungan_orang_tua", "fasilitas_belajar",
      "minat_pada_pelajaran", "kesulitan_ekonomi", "motivasi_belajar"
    )),
    as.factor
  )
)
str(train.data)
## tibble [200 × 11] (S3: tbl_df/tbl/data.frame)
## $ usia : int [1:200] 15 19 15 15 16 18 16 16 15 21 ...
## $ jenis_kelamin : Factor w/ 2 levels "0","1": 1 1 1 2 2 2 2 1 2 2 ...
## $ nilai_rata_rata : num [1:200] 85.2 60.9 60.5 78.6 66.2 85.9 45.2 91.6 74.1 88.2 ...
## $ dukungan_orang_tua : Factor w/ 3 levels "1","2","3": 1 2 2 3 2 2 1 2 2 3 ...
## $ fasilitas_belajar : Factor w/ 3 levels "1","2","3": 2 3 1 2 1 1 3 2 1 3 ...
## $ jam_belajar_per_hari: num [1:200] 4.1 3.8 6.1 5.1 5.5 3.7 2.8 6.2 4.6 3 ...
## $ kehadiran_persen : num [1:200] 79.7 89.8 86.7 78.5 81 85.4 78.9 78.1 96.8 82.1 ...
## $ minat_pada_pelajaran: Factor w/ 3 levels "1","2","3": 2 2 1 3 1 2 1 2 2 3 ...
## $ kesulitan_ekonomi : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 1 1 1 2 ...
## $ jarak_rumah_sekolah : num [1:200] 11.5 6.3 15.9 9.8 3.5 1 6.4 4.9 6.5 9.5 ...
## $ motivasi_belajar : Factor w/ 3 levels "1","2","3": 1 2 1 2 1 1 1 2 1 3 ...
# Preprocessing (test set) ----
# Cast usia to integer, mirroring the training set.
test.data$usia <- as.integer(test.data$usia)
# Build each categorical predictor with the level set of the matching training
# column instead of letting as.factor() derive levels from the test values
# alone. Independently derived levels diverge as soon as the test file lacks
# (or adds) a category, and the fitted models then reject or mis-encode the new
# data. Values that never occurred in the training column become NA.
test.data[c("jenis_kelamin", "dukungan_orang_tua", "fasilitas_belajar",
            "minat_pada_pelajaran", "kesulitan_ekonomi")] <- lapply(
  c("jenis_kelamin", "dukungan_orang_tua", "fasilitas_belajar",
    "minat_pada_pelajaran", "kesulitan_ekonomi"),
  function(col) factor(test.data[[col]], levels = levels(train.data[[col]]))
)
str(test.data)
## tibble [15 × 10] (S3: tbl_df/tbl/data.frame)
## $ usia : int [1:15] 15 19 17 20 16 21 21 19 17 18 ...
## $ jenis_kelamin : Factor w/ 2 levels "0","1": 2 1 1 2 2 1 1 2 1 1 ...
## $ nilai_rata_rata : num [1:15] 90.2 79.6 66.9 85.6 65.9 70.8 86.3 84.3 79.5 74.4 ...
## $ dukungan_orang_tua : Factor w/ 3 levels "1","2","3": 2 1 2 3 3 2 3 3 2 1 ...
## $ fasilitas_belajar : Factor w/ 3 levels "1","2","3": 2 1 3 1 1 3 2 3 2 2 ...
## $ jam_belajar_per_hari: num [1:15] 5.4 4.6 2.9 3.9 5.3 1.9 4.4 1.5 2 5.5 ...
## $ kehadiran_persen : num [1:15] 78.3 88.2 76.3 89.4 74 100 100 92.1 84.5 90.3 ...
## $ minat_pada_pelajaran: Factor w/ 3 levels "1","2","3": 2 3 1 1 1 1 1 1 3 3 ...
## $ kesulitan_ekonomi : Factor w/ 2 levels "0","1": 2 1 1 1 2 2 1 1 1 1 ...
## $ jarak_rumah_sekolah : num [1:15] 4.7 10.8 11.8 5.7 3.1 7.1 8.2 5.4 4.2 10.7 ...
DETEKSI NA
# Count the NA entries in each column of the training set.
colSums(is.na(train.data))
## usia jenis_kelamin nilai_rata_rata
## 0 0 0
## dukungan_orang_tua fasilitas_belajar jam_belajar_per_hari
## 0 0 0
## kehadiran_persen minat_pada_pelajaran kesulitan_ekonomi
## 0 0 0
## jarak_rumah_sekolah motivasi_belajar
## 0 0
# Count the NA entries in each column of the test set.
colSums(is.na(test.data))
## usia jenis_kelamin nilai_rata_rata
## 0 0 0
## dukungan_orang_tua fasilitas_belajar jam_belajar_per_hari
## 0 0 0
## kehadiran_persen minat_pada_pelajaran kesulitan_ekonomi
## 0 0 0
## jarak_rumah_sekolah
## 0
TIDAK ADA NILAI HILANG (NA) PADA DATA TRAINING MAUPUN DATA TESTING
DETEKSI DUPLIKAT
# Count the training-set rows that duplicated() flags as repeats.
sum(duplicated(train.data))
## [1] 0
# Count the test-set rows that duplicated() flags as repeats.
sum(duplicated(test.data))
## [1] 0
TIDAK ADA BARIS DUPLIKAT PADA DATA TRAINING MAUPUN DATA TESTING
DETEKSI & PENANGANAN OUTLIER
# Flag the values of a numeric vector that lie outside its IQR fences.
#
# Arguments:
#   x  Numeric vector to screen.
#   k  Fence multiplier applied to the interquartile range. The default of 1.5
#      reproduces the fences this function used before the parameter existed.
#
# Returns a logical vector the same length as x: TRUE where the value is below
# Q1 - k * (Q3 - Q1) or above Q3 + k * (Q3 - Q1). NA inputs yield NA.
is_outlier <- function(x, k = 1.5) {
  # Quartiles are computed with NAs removed so missing values do not turn the
  # fences themselves into NA.
  quartiles <- quantile(x, c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  # Called `spread` rather than `IQR` so stats::IQR() is not shadowed.
  spread <- quartiles[2] - quartiles[1]
  x < (quartiles[1] - k * spread) | x > (quartiles[2] + k * spread)
}
# Count IQR-rule outliers in every numeric column of the training set.
sapply(
  Filter(is.numeric, train.data),
  function(column) sum(is_outlier(column))
)
## usia nilai_rata_rata jam_belajar_per_hari
## 0 2 1
## kehadiran_persen jarak_rumah_sekolah
## 0 1
# Count IQR-rule outliers in every numeric column of the test set.
sapply(
  Filter(is.numeric, test.data),
  function(column) sum(is_outlier(column))
)
## usia nilai_rata_rata jam_belajar_per_hari
## 0 0 0
## kehadiran_persen jarak_rumah_sekolah
## 0 0
DITEMUKAN OUTLIER PADA DATA TRAINING UNTUK KOLOM nilai_rata_rata, jam_belajar_per_hari, DAN jarak_rumah_sekolah, MAKA DILAKUKAN PENANGANAN OUTLIER (WINSORIZING) PADA KETIGA KOLOM TERSEBUT
# Clip a numeric vector to its IQR fences (winsorizing).
#
# Arguments:
#   x  Numeric vector to clip.
#   k  Fence multiplier applied to the interquartile range. The default of 1.5
#      reproduces the fences this function used before the parameter existed.
#
# Returns x with every value below Q1 - k * (Q3 - Q1) replaced by that lower
# fence and every value above Q3 + k * (Q3 - Q1) replaced by that upper fence.
# NA entries are left as NA.
winsorize_column <- function(x, k = 1.5) {
  # Quartiles are computed with NAs removed so missing values do not turn the
  # fences themselves into NA.
  quartiles <- quantile(x, c(0.25, 0.75), na.rm = TRUE, names = FALSE)
  # Called `spread` rather than `IQR` so stats::IQR() is not shadowed.
  spread <- quartiles[2] - quartiles[1]
  lower <- quartiles[1] - k * spread
  upper <- quartiles[2] + k * spread
  x[x < lower] <- lower
  x[x > upper] <- upper
  x
}
# Training-set columns in which the IQR rule reported outliers.
cols_outlier <- c(
  "nilai_rata_rata",
  "jam_belajar_per_hari",
  "jarak_rumah_sekolah"
)
# Clip each of those columns to its own IQR fences.
train.data[cols_outlier] <- Map(winsorize_column, train.data[cols_outlier])
# Re-count outliers per numeric column after clipping.
sapply(
  Filter(is.numeric, train.data),
  function(column) sum(is_outlier(column))
)
## usia nilai_rata_rata jam_belajar_per_hari
## 0 0 0
## kehadiran_persen jarak_rumah_sekolah
## 0 0
SPLIT TRAINING DAN TESTING
# Fix the RNG seed before partitioning so the split below is reproducible.
set.seed(123)
# Draw the row indices of the fitting portion (p = 0.8) from the target column;
# list = FALSE returns them in non-list form so they can index rows directly.
index <- createDataPartition(train.data$motivasi_belajar, p = 0.8, list = FALSE)
# Rows picked by index are used to fit the models ...
train_split <- train.data[index, ]
# ... and all remaining rows are held out to evaluate them.
test_split <- train.data[-index, ]
# Model: SVM ----
# Fit a radial-kernel SVM on the fitting split with every other column as a
# predictor, predict the hold-out split, and compare against its true labels.
svm_model <- svm(
  motivasi_belajar ~ .,
  data = train_split,
  kernel = "radial"
)
svm_pred <- predict(svm_model, newdata = test_split)
confusionMatrix(
  data = svm_pred,
  reference = test_split$motivasi_belajar
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 8 1 0
## 2 8 20 2
## 3 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.7179
## 95% CI : (0.5513, 0.85)
## No Information Rate : 0.5385
## P-Value [Acc > NIR] : 0.01707
##
## Kappa : 0.4257
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 0.5000 0.9524 0.00000
## Specificity 0.9565 0.4444 1.00000
## Pos Pred Value 0.8889 0.6667 NaN
## Neg Pred Value 0.7333 0.8889 0.94872
## Prevalence 0.4103 0.5385 0.05128
## Detection Rate 0.2051 0.5128 0.00000
## Detection Prevalence 0.2308 0.7692 0.00000
## Balanced Accuracy 0.7283 0.6984 0.50000
# Model: random forest ----
# Seed the RNG right before fitting so the 100-tree forest is reproducible,
# then predict the hold-out split and compare against its true labels.
set.seed(123)
rf_model <- randomForest(
  motivasi_belajar ~ .,
  data = train_split,
  ntree = 100
)
rf_pred <- predict(rf_model, newdata = test_split)
confusionMatrix(
  data = rf_pred,
  reference = test_split$motivasi_belajar
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 10 1 0
## 2 6 20 2
## 3 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.7692
## 95% CI : (0.6067, 0.8887)
## No Information Rate : 0.5385
## P-Value [Acc > NIR] : 0.002543
##
## Kappa : 0.5363
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 0.6250 0.9524 0.00000
## Specificity 0.9565 0.5556 1.00000
## Pos Pred Value 0.9091 0.7143 NaN
## Neg Pred Value 0.7857 0.9091 0.94872
## Prevalence 0.4103 0.5385 0.05128
## Detection Rate 0.2564 0.5128 0.00000
## Detection Prevalence 0.2821 0.7179 0.00000
## Balanced Accuracy 0.7908 0.7540 0.50000
# Model: decision tree ----
# Fit a classification tree on the fitting split, predict class labels for the
# hold-out split, and compare against its true labels.
dt_model <- rpart(
  motivasi_belajar ~ .,
  data = train_split,
  method = "class"
)
dt_pred <- predict(dt_model, newdata = test_split, type = "class")
confusionMatrix(
  data = dt_pred,
  reference = test_split$motivasi_belajar
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 1 2 3
## 1 10 4 0
## 2 6 17 2
## 3 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.6923
## 95% CI : (0.5243, 0.8298)
## No Information Rate : 0.5385
## P-Value [Acc > NIR] : 0.0372
##
## Kappa : 0.3938
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 1 Class: 2 Class: 3
## Sensitivity 0.6250 0.8095 0.00000
## Specificity 0.8261 0.5556 1.00000
## Pos Pred Value 0.7143 0.6800 NaN
## Neg Pred Value 0.7600 0.7143 0.94872
## Prevalence 0.4103 0.5385 0.05128
## Detection Rate 0.2564 0.4359 0.00000
## Detection Prevalence 0.3590 0.6410 0.00000
## Balanced Accuracy 0.7255 0.6825 0.50000
# Testing: SVM ----
# Predict the unlabeled test set with the SVM. predict() returns a factor, and
# as.numeric() applied directly to a factor yields its internal level codes,
# which match the labels only while the levels happen to be "1", "2", "3".
# Converting through as.character() recovers the actual label values.
test.data.ori$motivasi_pred_svm_num <- as.numeric(
  as.character(predict(svm_model, newdata = test.data))
)
# Map the numeric labels 1/2/3 to their category names.
test.data.ori$motivasi_pred_svm_cat <- factor(
  test.data.ori$motivasi_pred_svm_num,
  levels = c(1, 2, 3),
  labels = c("Rendah", "Sedang", "Tinggi")
)
print(test.data.ori[, c("motivasi_pred_svm_num", "motivasi_pred_svm_cat")])
## # A tibble: 15 × 2
## motivasi_pred_svm_num motivasi_pred_svm_cat
## <dbl> <fct>
## 1 2 Sedang
## 2 2 Sedang
## 3 1 Rendah
## 4 2 Sedang
## 5 1 Rendah
## 6 2 Sedang
## 7 2 Sedang
## 8 2 Sedang
## 9 2 Sedang
## 10 2 Sedang
## 11 1 Rendah
## 12 1 Rendah
## 13 2 Sedang
## 14 1 Rendah
## 15 1 Rendah
# Testing: random forest ----
# Predict the unlabeled test set with the random forest. predict() returns a
# factor, and as.numeric() applied directly to a factor yields its internal
# level codes, which match the labels only while the levels happen to be
# "1", "2", "3". Converting through as.character() recovers the label values.
test.data.ori$motivasi_pred_rf_num <- as.numeric(
  as.character(predict(rf_model, newdata = test.data))
)
# Map the numeric labels 1/2/3 to their category names.
test.data.ori$motivasi_pred_rf_cat <- factor(
  test.data.ori$motivasi_pred_rf_num,
  levels = 1:3,
  labels = c("Rendah", "Sedang", "Tinggi")
)
print(test.data.ori[, c("motivasi_pred_rf_num", "motivasi_pred_rf_cat")])
## # A tibble: 15 × 2
## motivasi_pred_rf_num motivasi_pred_rf_cat
## <dbl> <fct>
## 1 2 Sedang
## 2 2 Sedang
## 3 1 Rendah
## 4 2 Sedang
## 5 1 Rendah
## 6 2 Sedang
## 7 2 Sedang
## 8 2 Sedang
## 9 2 Sedang
## 10 2 Sedang
## 11 1 Rendah
## 12 1 Rendah
## 13 2 Sedang
## 14 2 Sedang
## 15 1 Rendah
# Testing: decision tree ----
# Predict class labels for the unlabeled test set with the decision tree.
# predict() returns a factor, and as.numeric() applied directly to a factor
# yields its internal level codes, which match the labels only while the
# levels happen to be "1", "2", "3". Converting through as.character()
# recovers the label values.
test.data.ori$motivasi_pred_dt_num <- as.numeric(
  as.character(predict(dt_model, newdata = test.data, type = "class"))
)
# Map the numeric labels 1/2/3 to their category names.
test.data.ori$motivasi_pred_dt_cat <- factor(
  test.data.ori$motivasi_pred_dt_num,
  levels = 1:3,
  labels = c("Rendah", "Sedang", "Tinggi")
)
print(test.data.ori[, c("motivasi_pred_dt_num", "motivasi_pred_dt_cat")])
## # A tibble: 15 × 2
## motivasi_pred_dt_num motivasi_pred_dt_cat
## <dbl> <fct>
## 1 2 Sedang
## 2 2 Sedang
## 3 2 Sedang
## 4 1 Rendah
## 5 1 Rendah
## 6 2 Sedang
## 7 1 Rendah
## 8 2 Sedang
## 9 1 Rendah
## 10 2 Sedang
## 11 1 Rendah
## 12 2 Sedang
## 13 2 Sedang
## 14 2 Sedang
## 15 1 Rendah