library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.3
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.4.3
## Warning: package 'readr' was built under R version 4.4.3
## Warning: package 'forcats' was built under R version 4.4.3
## Warning: package 'lubridate' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.5
## ✔ lubridate 1.9.4 ✔ stringr 1.5.1
## ✔ purrr 1.0.4 ✔ tibble 3.2.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(randomForest)
## Warning: package 'randomForest' was built under R version 4.4.3
## randomForest 4.7-1.2
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
##
## The following object is masked from 'package:ggplot2':
##
## margin
##
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Warning: package 'caret' was built under R version 4.4.3
## Loading required package: lattice
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
library(readxl)
## Warning: package 'readxl' was built under R version 4.4.3
library(randomForest) #for random forest
library(e1071) # for SVM
## Warning: package 'e1071' was built under R version 4.4.3
library(rpart) # for decision tree
## Warning: package 'rpart' was built under R version 4.4.3
# Directory holding the Excel workbooks. It is defined once so the location
# only has to be changed in a single place when the script runs elsewhere.
data_dir <- "D:/SEMESTER ^6^ DPEN/Data Mining/UAS"
# Training data (with the target column motivasi_belajar) and the test data
# that will be scored later on.
data_train <- read_excel(file.path(data_dir, "datatraining.xlsx"))
data_test <- read_excel(file.path(data_dir, "datatesting.xlsx"))
# Check the structure of the data
str(data_train)
## tibble [200 × 11] (S3: tbl_df/tbl/data.frame)
## $ usia : num [1:200] 15 19 15 15 16 18 16 16 15 21 ...
## $ jenis_kelamin : chr [1:200] "0" "0" "0" "1" ...
## $ nilai_rata_rata : num [1:200] 85.2 60.9 60.5 78.6 66.2 85.9 45.2 91.6 74.1 88.2 ...
## $ dukungan_orang_tua : chr [1:200] "1" "2" "2" "3" ...
## $ fasilitas_belajar : chr [1:200] "2" "3" "1" "2" ...
## $ jam_belajar_per_hari: num [1:200] 4.1 3.8 6.1 5.1 5.5 3.7 2.8 6.2 4.6 3 ...
## $ kehadiran_persen : num [1:200] 79.7 89.8 86.7 78.5 81 85.4 78.9 78.1 96.8 82.1 ...
## $ minat_pada_pelajaran: chr [1:200] "2" "2" "1" "3" ...
## $ kesulitan_ekonomi : chr [1:200] "0" "1" "1" "1" ...
## $ jarak_rumah_sekolah : num [1:200] 11.5 6.3 15.9 9.8 3.5 1 6.4 4.9 6.5 9.5 ...
## $ motivasi_belajar : chr [1:200] "1" "2" "1" "2" ...
# Inspect the structure of the test data as read from Excel
data_test |> str()
## tibble [15 × 10] (S3: tbl_df/tbl/data.frame)
## $ usia : num [1:15] 15 19 17 20 16 21 21 19 17 18 ...
## $ jenis_kelamin : chr [1:15] "1" "0" "0" "1" ...
## $ nilai_rata_rata : num [1:15] 90.2 79.6 66.9 85.6 65.9 70.8 86.3 84.3 79.5 74.4 ...
## $ dukungan_orang_tua : chr [1:15] "2" "1" "2" "3" ...
## $ fasilitas_belajar : chr [1:15] "2" "1" "3" "1" ...
## $ jam_belajar_per_hari: num [1:15] 5.4 4.6 2.9 3.9 5.3 1.9 4.4 1.5 2 5.5 ...
## $ kehadiran_persen : num [1:15] 78.3 88.2 76.3 89.4 74 100 100 92.1 84.5 90.3 ...
## $ minat_pada_pelajaran: chr [1:15] "2" "3" "1" "1" ...
## $ kesulitan_ekonomi : chr [1:15] "1" "0" "0" "0" ...
## $ jarak_rumah_sekolah : num [1:15] 4.7 10.8 11.8 5.7 3.1 7.1 8.2 5.4 4.2 10.7 ...
# Convert the categorical columns (read from Excel as character) to factors
factor_cols <- c("jenis_kelamin", "dukungan_orang_tua", "fasilitas_belajar",
                 "minat_pada_pelajaran", "kesulitan_ekonomi", "motivasi_belajar")
data_train[factor_cols] <- lapply(data_train[factor_cols], as.factor)
# The test data has no motivasi_belajar column (it is the value to predict)
factor_cols_test <- setdiff(factor_cols, "motivasi_belajar")
# Build the test factors on the *training* levels instead of calling
# as.factor() independently: with only a handful of test rows a category can
# easily be absent, which would give the test factor a different level set
# than the one the models were trained on.
data_test[factor_cols_test] <- Map(
  function(values, train_col) {
    unseen <- setdiff(unique(values[!is.na(values)]), levels(train_col))
    if (length(unseen) > 0) {
      # Such values become NA below; make that visible instead of silent.
      warning(
        "Test data contains categories not seen in training: ",
        paste(unseen, collapse = ", "),
        call. = FALSE
      )
    }
    factor(values, levels = levels(train_col))
  },
  data_test[factor_cols_test],
  data_train[factor_cols_test]
)
# Confirm the conversion
str(data_train)
## tibble [200 × 11] (S3: tbl_df/tbl/data.frame)
## $ usia : num [1:200] 15 19 15 15 16 18 16 16 15 21 ...
## $ jenis_kelamin : Factor w/ 2 levels "0","1": 1 1 1 2 2 2 2 1 2 2 ...
## $ nilai_rata_rata : num [1:200] 85.2 60.9 60.5 78.6 66.2 85.9 45.2 91.6 74.1 88.2 ...
## $ dukungan_orang_tua : Factor w/ 3 levels "1","2","3": 1 2 2 3 2 2 1 2 2 3 ...
## $ fasilitas_belajar : Factor w/ 3 levels "1","2","3": 2 3 1 2 1 1 3 2 1 3 ...
## $ jam_belajar_per_hari: num [1:200] 4.1 3.8 6.1 5.1 5.5 3.7 2.8 6.2 4.6 3 ...
## $ kehadiran_persen : num [1:200] 79.7 89.8 86.7 78.5 81 85.4 78.9 78.1 96.8 82.1 ...
## $ minat_pada_pelajaran: Factor w/ 3 levels "1","2","3": 2 2 1 3 1 2 1 2 2 3 ...
## $ kesulitan_ekonomi : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 1 1 1 2 ...
## $ jarak_rumah_sekolah : num [1:200] 11.5 6.3 15.9 9.8 3.5 1 6.4 4.9 6.5 9.5 ...
## $ motivasi_belajar : Factor w/ 3 levels "1","2","3": 1 2 1 2 1 1 1 2 1 3 ...
# Confirm the conversion on the test data as well
data_test |> str()
## tibble [15 × 10] (S3: tbl_df/tbl/data.frame)
## $ usia : num [1:15] 15 19 17 20 16 21 21 19 17 18 ...
## $ jenis_kelamin : Factor w/ 2 levels "0","1": 2 1 1 2 2 1 1 2 1 1 ...
## $ nilai_rata_rata : num [1:15] 90.2 79.6 66.9 85.6 65.9 70.8 86.3 84.3 79.5 74.4 ...
## $ dukungan_orang_tua : Factor w/ 3 levels "1","2","3": 2 1 2 3 3 2 3 3 2 1 ...
## $ fasilitas_belajar : Factor w/ 3 levels "1","2","3": 2 1 3 1 1 3 2 3 2 2 ...
## $ jam_belajar_per_hari: num [1:15] 5.4 4.6 2.9 3.9 5.3 1.9 4.4 1.5 2 5.5 ...
## $ kehadiran_persen : num [1:15] 78.3 88.2 76.3 89.4 74 100 100 92.1 84.5 90.3 ...
## $ minat_pada_pelajaran: Factor w/ 3 levels "1","2","3": 2 3 1 1 1 1 1 1 3 3 ...
## $ kesulitan_ekonomi : Factor w/ 2 levels "0","1": 2 1 1 1 2 2 1 1 1 1 ...
## $ jarak_rumah_sekolah : num [1:15] 4.7 10.8 11.8 5.7 3.1 7.1 8.2 5.4 4.2 10.7 ...
# Count the missing values in every column of the training data
data_train |>
  is.na() |>
  colSums()
## usia jenis_kelamin nilai_rata_rata
## 0 0 0
## dukungan_orang_tua fasilitas_belajar jam_belajar_per_hari
## 0 0 0
## kehadiran_persen minat_pada_pelajaran kesulitan_ekonomi
## 0 0 0
## jarak_rumah_sekolah motivasi_belajar
## 0 0
# Same check for the test data
data_test |>
  is.na() |>
  colSums()
## usia jenis_kelamin nilai_rata_rata
## 0 0 0
## dukungan_orang_tua fasilitas_belajar jam_belajar_per_hari
## 0 0 0
## kehadiran_persen minat_pada_pelajaran kesulitan_ekonomi
## 0 0 0
## jarak_rumah_sekolah
## 0
# Count the fully duplicated rows in each data set
data_train |>
  duplicated() |>
  sum()
## [1] 0
data_test |>
  duplicated() |>
  sum()
## [1] 0
# Drop duplicated rows, if any, keeping the first occurrence of each row
data_train <- unique(data_train)
data_test <- unique(data_test)
data_train |> str()
## tibble [200 × 11] (S3: tbl_df/tbl/data.frame)
## $ usia : num [1:200] 15 19 15 15 16 18 16 16 15 21 ...
## $ jenis_kelamin : Factor w/ 2 levels "0","1": 1 1 1 2 2 2 2 1 2 2 ...
## $ nilai_rata_rata : num [1:200] 85.2 60.9 60.5 78.6 66.2 85.9 45.2 91.6 74.1 88.2 ...
## $ dukungan_orang_tua : Factor w/ 3 levels "1","2","3": 1 2 2 3 2 2 1 2 2 3 ...
## $ fasilitas_belajar : Factor w/ 3 levels "1","2","3": 2 3 1 2 1 1 3 2 1 3 ...
## $ jam_belajar_per_hari: num [1:200] 4.1 3.8 6.1 5.1 5.5 3.7 2.8 6.2 4.6 3 ...
## $ kehadiran_persen : num [1:200] 79.7 89.8 86.7 78.5 81 85.4 78.9 78.1 96.8 82.1 ...
## $ minat_pada_pelajaran: Factor w/ 3 levels "1","2","3": 2 2 1 3 1 2 1 2 2 3 ...
## $ kesulitan_ekonomi : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 1 1 1 2 ...
## $ jarak_rumah_sekolah : num [1:200] 11.5 6.3 15.9 9.8 3.5 1 6.4 4.9 6.5 9.5 ...
## $ motivasi_belajar : Factor w/ 3 levels "1","2","3": 1 2 1 2 1 1 1 2 1 3 ...
# Descriptive statistics / level counts for every training column
data_train |> summary()
## usia jenis_kelamin nilai_rata_rata dukungan_orang_tua
## Min. :15.00 0: 95 Min. : 41.50 1: 39
## 1st Qu.:16.00 1:105 1st Qu.: 66.95 2:111
## Median :18.00 Median : 74.40 3: 50
## Mean :17.78 Mean : 74.14
## 3rd Qu.:20.00 3rd Qu.: 80.78
## Max. :21.00 Max. :100.00
## fasilitas_belajar jam_belajar_per_hari kehadiran_persen minat_pada_pelajaran
## 1:52 Min. :1.000 Min. : 66.30 1:65
## 2:88 1st Qu.:2.900 1st Qu.: 79.70 2:76
## 3:60 Median :4.050 Median : 86.10 3:59
## Mean :4.095 Mean : 85.53
## 3rd Qu.:5.100 3rd Qu.: 90.80
## Max. :8.800 Max. :100.00
## kesulitan_ekonomi jarak_rumah_sekolah motivasi_belajar
## 0:112 Min. : 1.000 1: 81
## 1: 88 1st Qu.: 5.475 2:105
## Median : 8.300 3: 14
## Mean : 8.315
## 3rd Qu.:11.300
## Max. :20.500
library(e1071)
# Support vector classifier with a linear kernel; motivasi_belajar is the
# response and every other column of data_train is a predictor.
# probability = TRUE additionally fits the model needed for class
# probabilities.
model_svm <- svm(
  formula = motivasi_belajar ~ .,
  data = data_train,
  kernel = "linear",
  probability = TRUE
)
summary(model_svm)
##
## Call:
## svm(formula = motivasi_belajar ~ ., data = data_train, kernel = "linear",
## probability = TRUE)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 116
##
## ( 42 61 13 )
##
##
## Number of Classes: 3
##
## Levels:
## 1 2 3
library(randomForest)
# Fix the RNG seed before fitting: a random forest draws bootstrap samples and
# random feature subsets, so without a seed each run yields a different
# forest, OOB error estimate and potentially different predictions.
# NOTE(review): the output recorded below this call was produced without a
# seed, so its numbers may differ slightly after re-running.
set.seed(123)
# 100 trees, 3 candidate variables per split; importance = TRUE also stores
# the variable-importance measures.
# NOTE(review): in the recorded confusion matrix class 3 (14 training rows) is
# never predicted -- consider handling the class imbalance.
model_rf <- randomForest(
  motivasi_belajar ~ .,
  data = data_train,
  ntree = 100,
  mtry = 3,
  importance = TRUE
)
print(model_rf)
##
## Call:
## randomForest(formula = motivasi_belajar ~ ., data = data_train, ntree = 100, mtry = 3, importance = TRUE)
## Type of random forest: classification
## Number of trees: 100
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 30.5%
## Confusion matrix:
## 1 2 3 class.error
## 1 53 28 0 0.3456790
## 2 19 86 0 0.1809524
## 3 0 14 0 1.0000000
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.4.3
# Classification tree for motivasi_belajar using all other columns of
# data_train as predictors
model_dt <- rpart(
  formula = motivasi_belajar ~ .,
  data = data_train,
  method = "class"
)
# Draw the fitted tree
rpart.plot(model_dt)
# 3. Prediction: use the models built above to predict the motivasi_belajar
# category for the 15 students in the test data.
# Score the test data with each model. All three predictions are computed
# from the same, still unmodified test data before any prediction column is
# attached, so no model is handed another model's output as an extra column.
pred_svm <- predict(model_svm, newdata = data_test)
pred_rf <- predict(model_rf, newdata = data_test)
# type = "class" requests predicted labels from the tree
pred_dt <- predict(model_dt, newdata = data_test, type = "class")

# Attach the predictions to the test data
data_test$pred_svm <- pred_svm
data_test$pred_rf <- pred_rf
data_test$pred_dt <- pred_dt

# Show the predictions of the three models for the 15 students
hasil_prediksi <- data_test %>%
  select(pred_svm, pred_rf, pred_dt)
print(hasil_prediksi)
## # A tibble: 15 × 3
## pred_svm pred_rf pred_dt
## <fct> <fct> <fct>
## 1 2 2 2
## 2 2 2 2
## 3 1 1 1
## 4 2 2 2
## 5 1 1 1
## 6 2 2 2
## 7 2 2 2
## 8 2 2 3
## 9 2 2 1
## 10 2 2 2
## 11 1 1 2
## 12 1 1 1
## 13 2 2 1
## 14 1 2 2
## 15 1 1 1