UAS Data Mining

#instal library 
library(caret)

## Warning: package 'caret' was built under R version 4.4.3

## Loading required package: ggplot2

## Loading required package: lattice

## Warning: package 'lattice' was built under R version 4.4.3

library(e1071)

## Warning: package 'e1071' was built under R version 4.4.3

library(randomForest)

## Warning: package 'randomForest' was built under R version 4.4.3

## randomForest 4.7-1.2

## Type rfNews() to see new features/changes/bug fixes.

## 
## Attaching package: 'randomForest'

## The following object is masked from 'package:ggplot2':
## 
##     margin

library(rpart)
library(readxl)

## Warning: package 'readxl' was built under R version 4.4.3

#inputdata
data_testing<-read_xlsx("C:/Users/HP Pavilion 14/Documents/Data Mining/datatesting.xlsx")
data_training<-read_xlsx("C:/Users/HP Pavilion 14/Documents/Data Mining/datatraining.xlsx")
summary(data_training)

##       usia       jenis_kelamin      nilai_rata_rata  dukungan_orang_tua
##  Min.   :15.00   Length:200         Min.   : 41.50   Length:200        
##  1st Qu.:16.00   Class :character   1st Qu.: 66.95   Class :character  
##  Median :18.00   Mode  :character   Median : 74.40   Mode  :character  
##  Mean   :17.78                      Mean   : 74.14                     
##  3rd Qu.:20.00                      3rd Qu.: 80.78                     
##  Max.   :21.00                      Max.   :100.00                     
##  fasilitas_belajar  jam_belajar_per_hari kehadiran_persen minat_pada_pelajaran
##  Length:200         Min.   :1.000        Min.   : 66.30   Length:200          
##  Class :character   1st Qu.:2.900        1st Qu.: 79.70   Class :character    
##  Mode  :character   Median :4.050        Median : 86.10   Mode  :character    
##                     Mean   :4.095        Mean   : 85.53                       
##                     3rd Qu.:5.100        3rd Qu.: 90.80                       
##                     Max.   :8.800        Max.   :100.00                       
##  kesulitan_ekonomi  jarak_rumah_sekolah motivasi_belajar  
##  Length:200         Min.   : 1.000      Length:200        
##  Class :character   1st Qu.: 5.475      Class :character  
##  Mode  :character   Median : 8.300      Mode  :character  
##                     Mean   : 8.315                        
##                     3rd Qu.:11.300                        
##                     Max.   :20.500

#preprocessing
data_training[c("jenis_kelamin","dukungan_orang_tua","fasilitas_belajar","minat_pada_pelajaran","kesulitan_ekonomi","motivasi_belajar")] <- lapply(data_training[c("jenis_kelamin","dukungan_orang_tua","fasilitas_belajar","minat_pada_pelajaran","kesulitan_ekonomi","motivasi_belajar")],as.factor)
str(data_training)

## tibble [200 × 11] (S3: tbl_df/tbl/data.frame)
##  $ usia                : num [1:200] 15 19 15 15 16 18 16 16 15 21 ...
##  $ jenis_kelamin       : Factor w/ 2 levels "0","1": 1 1 1 2 2 2 2 1 2 2 ...
##  $ nilai_rata_rata     : num [1:200] 85.2 60.9 60.5 78.6 66.2 85.9 45.2 91.6 74.1 88.2 ...
##  $ dukungan_orang_tua  : Factor w/ 3 levels "1","2","3": 1 2 2 3 2 2 1 2 2 3 ...
##  $ fasilitas_belajar   : Factor w/ 3 levels "1","2","3": 2 3 1 2 1 1 3 2 1 3 ...
##  $ jam_belajar_per_hari: num [1:200] 4.1 3.8 6.1 5.1 5.5 3.7 2.8 6.2 4.6 3 ...
##  $ kehadiran_persen    : num [1:200] 79.7 89.8 86.7 78.5 81 85.4 78.9 78.1 96.8 82.1 ...
##  $ minat_pada_pelajaran: Factor w/ 3 levels "1","2","3": 2 2 1 3 1 2 1 2 2 3 ...
##  $ kesulitan_ekonomi   : Factor w/ 2 levels "0","1": 1 2 2 2 2 2 1 1 1 2 ...
##  $ jarak_rumah_sekolah : num [1:200] 11.5 6.3 15.9 9.8 3.5 1 6.4 4.9 6.5 9.5 ...
##  $ motivasi_belajar    : Factor w/ 3 levels "1","2","3": 1 2 1 2 1 1 1 2 1 3 ...

data_testing[c("jenis_kelamin","dukungan_orang_tua","fasilitas_belajar","minat_pada_pelajaran","kesulitan_ekonomi")] <- lapply(data_testing[c("jenis_kelamin","dukungan_orang_tua","fasilitas_belajar","minat_pada_pelajaran","kesulitan_ekonomi")],as.factor)
str(data_testing)

## tibble [15 × 10] (S3: tbl_df/tbl/data.frame)
##  $ usia                : num [1:15] 15 19 17 20 16 21 21 19 17 18 ...
##  $ jenis_kelamin       : Factor w/ 2 levels "0","1": 2 1 1 2 2 1 1 2 1 1 ...
##  $ nilai_rata_rata     : num [1:15] 90.2 79.6 66.9 85.6 65.9 70.8 86.3 84.3 79.5 74.4 ...
##  $ dukungan_orang_tua  : Factor w/ 3 levels "1","2","3": 2 1 2 3 3 2 3 3 2 1 ...
##  $ fasilitas_belajar   : Factor w/ 3 levels "1","2","3": 2 1 3 1 1 3 2 3 2 2 ...
##  $ jam_belajar_per_hari: num [1:15] 5.4 4.6 2.9 3.9 5.3 1.9 4.4 1.5 2 5.5 ...
##  $ kehadiran_persen    : num [1:15] 78.3 88.2 76.3 89.4 74 100 100 92.1 84.5 90.3 ...
##  $ minat_pada_pelajaran: Factor w/ 3 levels "1","2","3": 2 3 1 1 1 1 1 1 3 3 ...
##  $ kesulitan_ekonomi   : Factor w/ 2 levels "0","1": 2 1 1 1 2 2 1 1 1 1 ...
##  $ jarak_rumah_sekolah : num [1:15] 4.7 10.8 11.8 5.7 3.1 7.1 8.2 5.4 4.2 10.7 ...

#split data
set.seed(123)
index <- createDataPartition(data_training$motivasi_belajar, p=0.8, list=FALSE)
train_set<-data_training[index,]
valid_set<-data_training[-index,]

#latih model SVM
model_svm <- svm(motivasi_belajar~.,data=data_training, kernel="linear")
prediksi_svm <- predict(model_svm, newdata=valid_set)

confusion_matrix <- confusionMatrix(prediksi_svm, valid_set$motivasi_belajar)
print(confusion_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3
##          1  9  1  0
##          2  7 20  2
##          3  0  0  0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.7436          
##                  95% CI : (0.5787, 0.8696)
##     No Information Rate : 0.5385          
##     P-Value [Acc > NIR] : 0.006998        
##                                           
##                   Kappa : 0.4814          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3
## Sensitivity            0.5625   0.9524  0.00000
## Specificity            0.9565   0.5000  1.00000
## Pos Pred Value         0.9000   0.6897      NaN
## Neg Pred Value         0.7586   0.9000  0.94872
## Prevalence             0.4103   0.5385  0.05128
## Detection Rate         0.2308   0.5128  0.00000
## Detection Prevalence   0.2564   0.7436  0.00000
## Balanced Accuracy      0.7595   0.7262  0.50000

#prediksi
svm_pred_test <- predict(model_svm, newdata=data_testing)
data_testing$prediksi_motivasi_belajar <- svm_pred_test
print(data_testing[,c("prediksi_motivasi_belajar")])

## # A tibble: 15 × 1
##    prediksi_motivasi_belajar
##    <fct>                    
##  1 2                        
##  2 2                        
##  3 1                        
##  4 2                        
##  5 1                        
##  6 2                        
##  7 2                        
##  8 2                        
##  9 2                        
## 10 2                        
## 11 1                        
## 12 1                        
## 13 2                        
## 14 1                        
## 15 1

#latih random forest 
model_rf <- randomForest(motivasi_belajar~., data = data_training, ntree=100, mtry=2, importance=TRUE)
prediksi_rf <- predict(model_rf, newdata=valid_set)

confusion_matrix <- confusionMatrix(prediksi_rf,valid_set$motivasi_belajar)
print(confusion_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3
##          1 16  0  0
##          2  0 21  0
##          3  0  0  2
## 
## Overall Statistics
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9097, 1)
##     No Information Rate : 0.5385     
##     P-Value [Acc > NIR] : 3.274e-11  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3
## Sensitivity            1.0000   1.0000  1.00000
## Specificity            1.0000   1.0000  1.00000
## Pos Pred Value         1.0000   1.0000  1.00000
## Neg Pred Value         1.0000   1.0000  1.00000
## Prevalence             0.4103   0.5385  0.05128
## Detection Rate         0.4103   0.5385  0.05128
## Detection Prevalence   0.4103   0.5385  0.05128
## Balanced Accuracy      1.0000   1.0000  1.00000

#prediksi
rf_pred_test <- predict(model_rf, newdata = data_testing)
data_testing$prediksi_motivasi_belajar <- rf_pred_test
print(data_testing[,c("prediksi_motivasi_belajar")])

## # A tibble: 15 × 1
##    prediksi_motivasi_belajar
##    <fct>                    
##  1 2                        
##  2 2                        
##  3 1                        
##  4 2                        
##  5 1                        
##  6 2                        
##  7 2                        
##  8 2                        
##  9 2                        
## 10 2                        
## 11 1                        
## 12 1                        
## 13 2                        
## 14 2                        
## 15 1

#latih decision tree
model_dt <- rpart(motivasi_belajar~., data = train_set, method= "class")
prediksi_dt <- predict(model_dt, newdata = valid_set, type= "class")
confusion_matrix <- confusionMatrix(prediksi_dt, valid_set$motivasi_belajar)
print(confusion_matrix)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  1  2  3
##          1 10  4  0
##          2  6 17  2
##          3  0  0  0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.6923          
##                  95% CI : (0.5243, 0.8298)
##     No Information Rate : 0.5385          
##     P-Value [Acc > NIR] : 0.0372          
##                                           
##                   Kappa : 0.3938          
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 1 Class: 2 Class: 3
## Sensitivity            0.6250   0.8095  0.00000
## Specificity            0.8261   0.5556  1.00000
## Pos Pred Value         0.7143   0.6800      NaN
## Neg Pred Value         0.7600   0.7143  0.94872
## Prevalence             0.4103   0.5385  0.05128
## Detection Rate         0.2564   0.4359  0.00000
## Detection Prevalence   0.3590   0.6410  0.00000
## Balanced Accuracy      0.7255   0.6825  0.50000

#prediksi
dt_pred_test <- predict(model_dt, newdata = data_testing, type = "class")
data_testing$prediksi_motivasi_belajar <- dt_pred_test
print(data_testing[,c("prediksi_motivasi_belajar")])

## # A tibble: 15 × 1
##    prediksi_motivasi_belajar
##    <fct>                    
##  1 2                        
##  2 2                        
##  3 2                        
##  4 1                        
##  5 1                        
##  6 2                        
##  7 1                        
##  8 2                        
##  9 1                        
## 10 2                        
## 11 1                        
## 12 2                        
## 13 2                        
## 14 2                        
## 15 1

UAS Data Mining

Umniati Kamila

2025-05-28