R04STA1381: Data Analysis Part 1

Required packages:

Library

library(tidyverse)
library(kableExtra)
library(ggplot2)
library(ggthemes)
library(stringr)
library(reshape2)
library(mice)
library(nortest)
library(DescTools)
library(caret)
library(rpart)
library(rpart.plot)
library(ROCit)
library(PRROC)
library(ROCR)
library(vip)

Data

Authors UI

library(readxl)
df_authors_ok <- read_excel("C:/SMT  5/Pengantar Sains Data/PSD 4/Tugas/GAB_Prodi_SINTA-_Terbaru_Tanpa unknown.xlsx") 
View(df_authors_ok)

# data structure before formatting
str(df_authors_ok)
## tibble [2,601 x 37] (S3: tbl_df/tbl/data.frame)
##  $ SINTA_ID                     : num [1:2601] 259819 6027567 6059011 5982895 6010622 ...
##  $ Nama                         : chr [1:2601] "ACHMAD NIZAR HIDAYANTO" "ENY KUSRINI" "ZUHERMAN RUSTAM" "MUHAMAD SAHLAN" ...
##  $ Universitas                  : chr [1:2601] "Universitas Indonesia" "Universitas Indonesia" "Universitas Indonesia" "Universitas Indonesia" ...
##  $ Kode_Prodi                   : num [1:2601] 57201 24201 44101 25202 22101 ...
##  $ Departemen                   : chr [1:2601] "S1 - Sistem Informasi" "S1 - Teknik Kimia" "S2 - Matematika" "S1 - Teknologi Bioproses" ...
##  $ Level                        : chr [1:2601] "S1" "S1" "S2" "S1" ...
##  $ Prodi                        : chr [1:2601] "Sistem Informasi" "Teknik Kimia" "Matematika" "Teknologi Bioproses" ...
##  $ SINTA_Score_Overall          : num [1:2601] 8816 7785 3579 4879 6618 ...
##  $ SINTA_Score_3Yr              : num [1:2601] 3547 2755 2570 2317 2239 ...
##  $ Affil_Score                  : num [1:2601] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Affil_Score_3Yr              : num [1:2601] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Scopus_Artikel               : num [1:2601] 370 154 156 177 162 124 124 235 187 98 ...
##  $ Scopus_Citation              : num [1:2601] 1521 1036 809 707 1199 ...
##  $ Scopus_Cited_Document        : num [1:2601] 247 127 116 122 112 53 53 160 134 82 ...
##  $ Scopus_H_Index               : num [1:2601] 17 15 16 14 15 6 6 11 17 14 ...
##  $ Scopus_i10_Index             : num [1:2601] 41 34 28 23 34 3 3 14 34 22 ...
##  $ Scopus_G_Index               : num [1:2601] 1 1 1 1 1 2 2 1 1 1 ...
##  $ GScholar_Artikel             : num [1:2601] 468 233 205 273 243 182 182 400 206 128 ...
##  $ GScholar_Citation            : num [1:2601] 3328 1399 1143 1117 2560 ...
##  $ GScholar_Cited_Document      : num [1:2601] 329 147 135 149 159 93 93 270 132 89 ...
##  $ GScholar_H_Index             : num [1:2601] 26 17 18 18 26 10 10 22 20 16 ...
##  $ GScholar_i10_Index           : num [1:2601] 105 43 42 40 60 12 12 72 40 26 ...
##  $ GScholar_G_Index             : num [1:2601] 1 1 2 1 1 1 1 1 2 1 ...
##  $ WOS_Artikel                  : num [1:2601] 129 99 0 0 68 43 43 0 94 62 ...
##  $ WOS_Citation                 : num [1:2601] 423 785 0 0 661 45 45 0 771 332 ...
##  $ WOS_Cited_Document           : num [1:2601] 91 91 0 0 55 20 20 0 73 47 ...
##  $ Status                       : chr [1:2601] "Aktif" "Aktif" "Aktif" "Aktif" ...
##  $ Akreditasi                   : chr [1:2601] "Unggul" "Unggul" "B" "Unggul" ...
##  $ Jumlah_Dosen_Penghitung_Rasio: num [1:2601] 29 38 14 23 34 29 29 23 35 14 ...
##  $ Jumlah_Dosen_NIDN            : num [1:2601] 16 14 7 7 4 5 5 12 6 7 ...
##  $ Jumlah_Dosen_NIDK            : num [1:2601] 0 1 0 0 1 1 1 0 0 0 ...
##  $ Jumlah_Dosen_Total           : num [1:2601] 16 15 7 7 5 6 6 12 6 7 ...
##  $ Jumlah_Mahasiswa             : num [1:2601] 815 591 55 225 476 206 206 403 91 55 ...
##  $ Rasio_Dosen_per_Mahasiswa    : chr [1:2601] "1 : 28.10" "1 : 15.55" "1 : 3.93" "1 : 9.78" ...
##  $ Rasio_Dosen_Per_Mahasiswa    : num [1:2601] 3.56 6.43 25.45 10.22 7.14 ...
##  $ jumlah_artikel               : num [1:2601] 967 486 361 450 473 349 349 635 487 288 ...
##  $ keterangan                   : chr [1:2601] "Diatas Median" "Diatas Median" "Diatas Median" "Diatas Median" ...

Data Formatting

df_authors_ok$Level <- as.factor(df_authors_ok$Level)
df_authors_ok$Prodi <- as.factor(df_authors_ok$Prodi)
df_authors_ok$Status <- as.factor(df_authors_ok$Status)
df_authors_ok$Akreditasi  <- as.factor(df_authors_ok$Akreditasi)
# data structure after formatting
glimpse(df_authors_ok[,c("Level", "Prodi", "Status", "Akreditasi")])
## Rows: 3,556
## Columns: 4
## $ Level      <fct> S1, S1, S2, S1, S2, S2, S2, S2, S2, S2, S3, S3, S2, S2, S1,~
## $ Prodi      <fct> "Sistem Informasi", "Teknik Kimia", "Matematika", "Teknolog~
## $ Status     <fct> Aktif, Aktif, Aktif, Aktif, Aktif, Aktif, Aktif, Aktif, Akt~
## $ Akreditasi <fct> Unggul, Unggul, B, Unggul, A, Unggul, Baik, A, A, B, Unggul~

Re-Level Factor

levels(df_authors_ok$Level) # original level order
## [1] "D3"      "D4"      "Profesi" "S1"      "S2"      "S3"      "Sp-1"   
## [8] "Sp-2"    "Unknown"
df_authors_ok$Level <- factor(df_authors_ok$Level,levels(df_authors_ok$Level)[c(1,2,5,6,3,4)]) #re-level
levels(df_authors_ok$Level) # after re-leveling
## [1] "D3"      "D4"      "S2"      "S3"      "Profesi" "S1"
levels(df_authors_ok$Akreditasi)
## [1] "-"                   "A"                   "B"                  
## [4] "Baik"                "Baik Sekali"         "Tidak Terakreditasi"
## [7] "Unggul"

Rumpun Ilmu (Field of Science) from Prodi

# Build a unique Kode_Prodi/Prodi table and extract the first 2 digits of the program code
df_rumpun <- df_authors_ok %>% 
  select(Kode_Prodi,Prodi)  %>% 
  group_by(Kode_Prodi,Prodi) %>%
  summarize() %>% 
  mutate(Kode_Prodi_2Digit = substr(Kode_Prodi,1,2)) %>%
  na.omit()
## `summarise()` has grouped output by 'Kode_Prodi'. You can override using the `.groups` argument.
df_rumpun
## # A tibble: 256 x 3
## # Groups:   Kode_Prodi [256]
##    Kode_Prodi Prodi                 Kode_Prodi_2Digit
##         <dbl> <fct>                 <chr>            
##  1      11001 Ilmu Kedokteran       11               
##  2      11002 Ilmu Biomedik         11               
##  3      11101 Ilmu Biomedik         11               
##  4      11108 Kedokteran Kerja      11               
##  5      11109 Pendidikan Kedokteran 11               
##  6      11123 Teknologi Biomedis    11               
##  7      11201 Pendidikan Dokter     11               
##  8      11301 Fisioterapi           11               
##  9      11303 Terapi Okupasi        11               
## 10      11401 Fisioterapi           11               
## # ... with 246 more rows
# Assign the field of science (Rumpun_Ilmu) based on the 2-digit program code
df_rumpun <- df_rumpun %>% 
  mutate(Rumpun_Ilmu = case_when(Kode_Prodi_2Digit==14 ~ "Kesehatan",
                                 Kode_Prodi_2Digit %in% c(20,21,22,31,55,57) ~ "Teknik",
                                 Kode_Prodi_2Digit %in% c(44:51) ~ "MIPA",
                                 Kode_Prodi_2Digit %in% c(60:63,93) ~ "Ekonomi",
                                 Kode_Prodi_2Digit %in% c(71:79) ~ "Bahasa",
                                 Kode_Prodi_2Digit %in% c(80:89,94,95) ~ "Pendidikan",
                                 Kode_Prodi_2Digit %in% c(90) ~ "Seni,Desain,Media"
                                 ))
df_rumpun
## # A tibble: 256 x 4
## # Groups:   Kode_Prodi [256]
##    Kode_Prodi Prodi                 Kode_Prodi_2Digit Rumpun_Ilmu
##         <dbl> <fct>                 <chr>             <chr>      
##  1      11001 Ilmu Kedokteran       11                <NA>       
##  2      11002 Ilmu Biomedik         11                <NA>       
##  3      11101 Ilmu Biomedik         11                <NA>       
##  4      11108 Kedokteran Kerja      11                <NA>       
##  5      11109 Pendidikan Kedokteran 11                <NA>       
##  6      11123 Teknologi Biomedis    11                <NA>       
##  7      11201 Pendidikan Dokter     11                <NA>       
##  8      11301 Fisioterapi           11                <NA>       
##  9      11303 Terapi Okupasi        11                <NA>       
## 10      11401 Fisioterapi           11                <NA>       
## # ... with 246 more rows
# Rumpun_Ilmu lookup table to be merged with the original data
df_rumpun_oke <- df_rumpun %>% select(Kode_Prodi,Rumpun_Ilmu)
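Because the case_when above only maps selected 2-digit codes, some codes (for example 11, visible in the output) are left as NA in Rumpun_Ilmu. A quick check of which codes remain unmapped (a small sketch):

# 2-digit program codes not covered by the case_when mapping (NA in Rumpun_Ilmu)
sort(unique(df_rumpun$Kode_Prodi_2Digit[is.na(df_rumpun$Rumpun_Ilmu)]))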

Analysis

# data structure
glimpse(df_authors_ok)
## Rows: 3,556
## Columns: 40
## $ SINTA_ID                      <dbl> 259819, 6027567, 6059011, 5982895, 60106~
## $ Nama                          <chr> "ACHMAD NIZAR HIDAYANTO", "ENY KUSRINI",~
## $ Universitas                   <chr> "Universitas Indonesia", "Universitas In~
## $ Kode_Prodi                    <dbl> 57201, 24201, 44101, 25202, 22101, 95129~
## $ Departemen                    <chr> "S1 - Sistem Informasi", "S1 - Teknik Ki~
## $ Level                         <fct> S1, S1, S2, S1, S2, S2, S2, S2, S2, S2, ~
## $ Prodi                         <fct> "Sistem Informasi", "Teknik Kimia", "Mat~
## $ SINTA_Score_Overall           <dbl> 8816, 7785, 3579, 4879, 6618, 3367, 3367~
## $ SINTA_Score_3Yr               <dbl> 3547, 2755, 2570, 2317, 2239, 2176, 2176~
## $ Affil_Score                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ Affil_Score_3Yr               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ Scopus_Artikel                <dbl> 370, 154, 156, 177, 162, 124, 124, 235, ~
## $ Scopus_Citation               <dbl> 1521, 1036, 809, 707, 1199, 147, 147, 64~
## $ Scopus_Cited_Document         <dbl> 247, 127, 116, 122, 112, 53, 53, 160, 13~
## $ Scopus_H_Index                <dbl> 17, 15, 16, 14, 15, 6, 6, 11, 17, 14, 8,~
## $ Scopus_i10_Index              <dbl> 41, 34, 28, 23, 34, 3, 3, 14, 34, 22, 5,~
## $ Scopus_G_Index                <dbl> 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1~
## $ GScholar_Artikel              <dbl> 468, 233, 205, 273, 243, 182, 182, 400, ~
## $ GScholar_Citation             <dbl> 3328, 1399, 1143, 1117, 2560, 382, 382, ~
## $ GScholar_Cited_Document       <dbl> 329, 147, 135, 149, 159, 93, 93, 270, 13~
## $ GScholar_H_Index              <dbl> 26, 17, 18, 18, 26, 10, 10, 22, 20, 16, ~
## $ GScholar_i10_Index            <dbl> 105, 43, 42, 40, 60, 12, 12, 72, 40, 26,~
## $ GScholar_G_Index              <dbl> 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1~
## $ WOS_Artikel                   <dbl> 129, 99, 0, 0, 68, 43, 43, 0, 94, 62, 0,~
## $ WOS_Citation                  <dbl> 423, 785, 0, 0, 661, 45, 45, 0, 771, 332~
## $ WOS_Cited_Document            <dbl> 91, 91, 0, 0, 55, 20, 20, 0, 73, 47, 0, ~
## $ WOS_H_Index                   <dbl> 10, 14, NA, NA, 12, 3, 3, NA, 14, 9, NA,~
## $ WOS_i10_Index                 <dbl> 12, 27, NA, NA, 14, 0, 0, NA, 18, 9, NA,~
## $ WOS_G_Index                   <dbl> 3, 14, NA, NA, 11, 1, 1, NA, 1, 7, NA, N~
## $ Status                        <fct> Aktif, Aktif, Aktif, Aktif, Aktif, Aktif~
## $ Akreditasi                    <fct> Unggul, Unggul, B, Unggul, A, Unggul, Ba~
## $ Jumlah_Dosen_Penghitung_Rasio <dbl> 29, 38, 14, 23, 34, 29, 29, 23, 35, 14, ~
## $ Jumlah_Dosen_NIDN             <dbl> 16, 14, 7, 7, 4, 5, 5, 12, 6, 7, 7, 7, 6~
## $ Jumlah_Dosen_NIDK             <dbl> 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0~
## $ Jumlah_Dosen_Total            <dbl> 16, 15, 7, 7, 5, 6, 6, 12, 6, 7, 7, 7, 6~
## $ Jumlah_Mahasiswa              <dbl> 815, 591, 55, 225, 476, 206, 206, 403, 9~
## $ Rasio_Dosen_per_Mahasiswa     <chr> "1 : 28.10", "1 : 15.55", "1 : 3.93", "1~
## $ Rasio_Dosen_Per_Mahasiswa     <dbl> 3.558282, 6.429780, 25.454545, 10.222222~
## $ jumlah_artikel                <dbl> 967, 486, 361, 450, 473, 349, 349, 635, ~
## $ keterangan                    <chr> "Diatas Median", "Diatas Median", "Diata~

Data

Unit of observation = Authors

y = SINTA_Score_3Yr, categorized into high and low

x1 = Rumpun Ilmu / field of science (Odd Semester 2021)

x2 = Level (Odd Semester 2021)

x3 = Akreditasi / accreditation (Odd Semester 2021)

x4 = Total number of lecturers, Jumlah_Dosen_Total (Odd Semester 2021)

x5 = Number of students, Jumlah_Mahasiswa (Odd Semester 2021)

x6 = Lecturer-to-student ratio, Rasio_Dosen_per_Mahasiswa (Odd Semester 2021)

data_1 <- df_authors_ok %>%  
  left_join(df_rumpun_oke, by="Kode_Prodi") %>%  
  select(SINTA_Score_3Yr,Prodi,Rumpun_Ilmu,Level,Akreditasi,Jumlah_Dosen_Total,Jumlah_Mahasiswa) %>% 
  mutate(Rasio_Dosen_per_Mahasiswa = df_authors_ok$Jumlah_Dosen_Penghitung_Rasio/df_authors_ok$Jumlah_Mahasiswa,
         y = ifelse(SINTA_Score_3Yr>=239,"1","0")) # class 1: high SINTA_Score_3Yr
data_1$y <- as.factor(data_1$y)
data_1$Rumpun_Ilmu <- as.factor(data_1$Rumpun_Ilmu)
summary(data_1)
##  SINTA_Score_3Yr                      Prodi          Rumpun_Ilmu  
##  Min.   :   0.0   Ilmu Hukum             :  88   Bahasa    : 422  
##  1st Qu.:   1.0   Pendidikan Dokter      :  70   Ekonomi   : 382  
##  Median :  37.0   Psikologi              :  67   MIPA      : 188  
##  Mean   : 141.1   Obstetri dan Ginekologi:  64   Teknik    : 187  
##  3rd Qu.: 161.0   Akuntansi              :  57   Pendidikan: 135  
##  Max.   :3547.0   (Other)                :2255   (Other)   :  54  
##                   NA's                   : 955   NA's      :2188  
##      Level        Akreditasi   Jumlah_Dosen_Total Jumlah_Mahasiswa
##  D3     :  62   A      :1538   Min.   : 0.00      Min.   :   0.0  
##  D4     :  18   Unggul : 743   1st Qu.: 6.00      1st Qu.:  66.0  
##  S2     : 637   -      : 140   Median :10.00      Median : 213.0  
##  S3     : 269   B      : 110   Mean   :16.85      Mean   : 375.3  
##  Profesi:  79   Baik   :  55   3rd Qu.:19.00      3rd Qu.: 503.0  
##  S1     : 985   (Other):  35   Max.   :71.00      Max.   :2290.0  
##  NA's   :1506   NA's   : 935   NA's   :935        NA's   :935     
##  Rasio_Dosen_per_Mahasiswa y       
##  Min.   :0.0123            0:2915  
##  1st Qu.:0.0666            1: 641  
##  Median :0.1346                    
##  Mean   :0.2373                    
##  3rd Qu.:0.3490                    
##  Max.   :1.4000                    
##  NA's   :990
# Check missing values
md.pattern(data_1,rotate.names = TRUE)

##      SINTA_Score_3Yr y Akreditasi Jumlah_Dosen_Total Jumlah_Mahasiswa Prodi
## 1325               1 1          1                  1                1     1
## 710                1 1          1                  1                1     1
## 502                1 1          1                  1                1     1
## 15                 1 1          1                  1                1     1
## 40                 1 1          1                  1                1     1
## 28                 1 1          1                  1                1     0
## 1                  1 1          1                  1                1     0
## 9                  1 1          0                  0                0     1
## 926                1 1          0                  0                0     0
##                    0 0        935                935              935   955
##      Rasio_Dosen_per_Mahasiswa Level Rumpun_Ilmu     
## 1325                         1     1           1    0
## 710                          1     1           0    1
## 502                          1     0           0    2
## 15                           0     1           1    1
## 40                           0     0           0    3
## 28                           1     0           1    2
## 1                            1     0           0    3
## 9                            0     0           0    6
## 926                          0     0           0    7
##                            990  1506        2188 8444
data_1 <- data_1 %>% filter(!is.na(Prodi),!is.na(Level),!is.na(Akreditasi),!is.na(Rumpun_Ilmu),
                            Rasio_Dosen_per_Mahasiswa!=Inf)
head(data_1) # data that will be used
## # A tibble: 6 x 9
##   SINTA_Score_3Yr Prodi           Rumpun_Ilmu Level Akreditasi Jumlah_Dosen_Tot~
##             <dbl> <fct>           <fct>       <fct> <fct>                  <dbl>
## 1            3547 Sistem Informa~ Teknik      S1    Unggul                    16
## 2            2570 Matematika      MIPA        S2    B                          7
## 3            2239 Teknik Sipil    Teknik      S2    A                          5
## 4            2176 Ilmu Lingkungan Pendidikan  S2    Unggul                     6
## 5            2176 Ilmu Lingkungan Pendidikan  S2    Baik                       6
## 6            2007 Teknologi Info~ Teknik      S2    A                         12
## # ... with 3 more variables: Jumlah_Mahasiswa <dbl>,
## #   Rasio_Dosen_per_Mahasiswa <dbl>, y <fct>
summary(data_1)
##  SINTA_Score_3Yr            Prodi                Rumpun_Ilmu      Level    
##  Min.   :   0.0   Ilmu Hukum   : 88   Bahasa           :422   D3     : 34  
##  1st Qu.:   3.0   Psikologi    : 67   Ekonomi          :366   D4     : 12  
##  Median :  41.0   Akuntansi    : 57   Kesehatan        : 44   S2     :413  
##  Mean   : 165.8   Ilmu Ekonomi : 56   MIPA             :188   S3     :168  
##  3rd Qu.: 175.0   Manajemen    : 43   Pendidikan       :135   Profesi: 24  
##  Max.   :3547.0   Ilmu Komputer: 39   Seni,Desain,Media:  9   S1     :674  
##                   (Other)      :975   Teknik           :161                
##                Akreditasi  Jumlah_Dosen_Total Jumlah_Mahasiswa
##  -                  : 48   Min.   : 2.00      Min.   :   8.0  
##  A                  :717   1st Qu.: 6.00      1st Qu.: 115.0  
##  B                  : 34   Median :11.00      Median : 311.0  
##  Baik               : 21   Mean   :18.31      Mean   : 523.8  
##  Baik Sekali        : 27   3rd Qu.:20.00      3rd Qu.: 643.0  
##  Tidak Terakreditasi:  8   Max.   :71.00      Max.   :2290.0  
##  Unggul             :470                                      
##  Rasio_Dosen_per_Mahasiswa y       
##  Min.   :0.01235           0:1062  
##  1st Qu.:0.05643           1: 263  
##  Median :0.07849                   
##  Mean   :0.14571                   
##  3rd Qu.:0.17647                   
##  Max.   :0.80000                   
## 

Exploratory Data Analysis (EDA)

Response Variable (y)

# prepare the summary data needed for the chart
data_chart <- data_1 %>% 
  group_by(y) %>%  
  summarize(value=n()) %>%
  mutate(prop = round(value / sum(value) *100, digits = 2))

# pie chart: distribution of authors by SINTA_Score_3Yr category
ggplot(data_chart, aes(x="", y=prop, fill=y)) +
  geom_bar(stat="identity", width=1, color="white") +
  coord_polar("y", start=0) +
  labs(title= "Proporsi Authors Menurut Kategori SINTA_Score_3Yr",
       subtitle = "Universitas Indonesia") +
  theme_void() 

Numeric Predictor Variables (X)

# Histogram of Jumlah_Dosen_Total
ggplot(data_1, aes(x=Jumlah_Dosen_Total)) +
  geom_histogram(fill="#69b3a2", color="#e9ecef", alpha=0.8, bins=15)+
  theme_light() +
  labs(x="Jumlah_Dosen_Total",
       y="Density",
       title= "Sebaran Jumlah_Dosen_Total",
       subtitle = "Universitas Indonesia") 

# Histogram of Jumlah_Mahasiswa
ggplot(data_1, aes(x=Jumlah_Mahasiswa)) +
  geom_histogram(fill="#69b3a2", color="#e9ecef", alpha=0.8, bins=15)+
  theme_light() +
  labs(x="Jumlah_Mahasiswa",
       y="Density",
       title= "Sebaran Jumlah_Mahasiswa",
       subtitle = "Universitas Indonesia") 

# Histogram of Rasio_Dosen_per_Mahasiswa
ggplot(data_1, aes(x=Rasio_Dosen_per_Mahasiswa)) +
  geom_histogram(fill="#69b3a2", color="#e9ecef", alpha=0.8, bins=20)+
  theme_light() +
  labs(x="Rasio_Dosen_per_Mahasiswa",
       y="Density",
       title= "Sebaran Rasio_Dosen_per_Mahasiswa",
       subtitle = "Universitas Indonesia") 

Categorical Predictor Variables (X)

# Akreditasi
data_bar_chart = data_1 %>%  
  group_by(Akreditasi)%>% 
  summarize(Jumlah=n())

ggplot(data_bar_chart, aes(x=Akreditasi, y=Jumlah)) + 
  geom_bar(stat = "identity",color="steelblue") +
  theme_light() +
  labs(x="",
       y="",
       title= "",
       subtitle = "Universitas Indonesia") +
  coord_flip()

# Level
data_bar_chart = data_1 %>%  
  group_by(Level)%>% 
  summarize(Jumlah=n())

ggplot(data_bar_chart, aes(x=Level, y=Jumlah)) + 
  geom_bar(stat = "identity",color="steelblue") +
  theme_light() +
  labs(x="",
       y="",
       title= "",
       subtitle = "Universitas Indonesia") +
  coord_flip()

# Rumpun Ilmu
data_bar_chart = data_1 %>%  
  group_by(Rumpun_Ilmu)%>% 
  summarize(Jumlah=n())

ggplot(data_bar_chart, aes(x=(Rumpun_Ilmu), y=Jumlah)) + 
  geom_bar(stat = "identity",color="steelblue") +
  theme_light() +
  labs(x="",
       y="",
       title= "",
       subtitle = "Universitas Indonesia") +
  coord_flip()

Relationship Between the Predictor Variables and the Response Variable

# Akreditasi & y
percentData <- data_1 %>% 
  group_by(Akreditasi) %>% 
  count(y) %>% 
  mutate(ratio=scales::percent(n/sum(n)))
ggplot(data_1, aes(x=factor(Akreditasi), fill=y))+
    geom_bar(position="fill")+
    scale_fill_manual(values=c("#7be217", "#4f58ab"))+ 
    geom_text(data=percentData, aes(y=n,label=ratio), color="white",position=position_fill(vjust=0.5))+
    labs( 
       y = "", 
       x = "Akreditasi", 
       subtitle = "UI",
       title = "Proporsi Peubah Respon Menurut Akreditasi")

# Level & y
percentData <- data_1 %>% 
  group_by(Level) %>% 
  count(y) %>% 
  mutate(ratio=scales::percent(n/sum(n)))
ggplot(data_1, aes(x=factor(Level), fill=y))+
    geom_bar(position="fill")+
    scale_fill_manual(values=c("#7be217", "#4f58ab"))+ 
    geom_text(data=percentData, aes(y=n,label=ratio), color="white",position=position_fill(vjust=0.5))+
    labs( 
       y = "", 
       x = "Level", 
       subtitle = "UI",
       title = "Proporsi Peubah Respon Menurut Level")

# Rumpun_Ilmu & y
percentData <- data_1 %>% 
  group_by(Rumpun_Ilmu) %>% 
  count(y) %>% 
  mutate(ratio=scales::percent(n/sum(n)))
ggplot(data_1, aes(x=factor(Rumpun_Ilmu), fill=y))+
    geom_bar(position="fill")+
    scale_fill_manual(values=c("#7be217", "#4f58ab"))+ 
    geom_text(data=percentData, aes(y=n,label=ratio), color="white",position=position_fill(vjust=0.5))+
    labs( 
       y = "", 
       x = "Rumpun_Ilmu", 
       subtitle = "UI",
       title = "Proporsi Peubah Respon Menurut Rumpun Ilmu")

# Jumlah_Mahasiswa & y

# Boxplot by response category
ggplot(data_1, aes(y=y, x=Jumlah_Mahasiswa)) + 
  geom_boxplot(fill="#69b3a2",  alpha=0.8) +
  theme_light() +
  labs(x="Jumlah_Mahasiswa",
       y="y",
       title= "Sebaran Jumlah Mahasiswa Menurut Peubah Respon",
       subtitle = "Universitas Indonesia") 

# Jumlah_Dosen_Total & y

# Boxplot by response category
ggplot(data_1, aes(y=y, x=Jumlah_Dosen_Total)) + 
  geom_boxplot(fill="#69b3a2",  alpha=0.8) +
  theme_light() +
  labs(x="Jumlah_Dosen_Total",
       y="y",
       title= "Sebaran Jumlah Dosen Total Menurut Peubah Respon",
       subtitle = "Universitas Indonesia") 

# Rasio_Dosen_per_Mahasiswa & y

# Boxplot by response category
ggplot(data_1, aes(y=y, x=Rasio_Dosen_per_Mahasiswa)) + 
  geom_boxplot(fill="#69b3a2",  alpha=0.8) +
  theme_light() +
  labs(x="Rasio_Dosen_per_Mahasiswa",
       y="y",
       title= "Sebaran Rasio Dosen per Mahasiswa Menurut Peubah Respon",
       subtitle = "Universitas Indonesia") 

Data for Modeling

# data to be used for the models
data_sinta <- data_1 %>% select(-c(SINTA_Score_3Yr,Prodi))
str(data_sinta)
## tibble [2,035 x 7] (S3: tbl_df/tbl/data.frame)
##  $ Rumpun_Ilmu              : Factor w/ 7 levels "Bahasa","Ekonomi",..: 7 NA 4 NA 7 5 5 7 7 4 ...
##  $ Level                    : Factor w/ 6 levels "D3","D4","S2",..: 6 6 3 6 3 3 3 3 3 3 ...
##  $ Akreditasi               : Factor w/ 7 levels "-","A","B","Baik",..: 7 7 3 7 2 7 4 2 2 3 ...
##  $ Jumlah_Dosen_Total       : num [1:2035] 16 15 7 7 5 6 6 12 6 7 ...
##  $ Jumlah_Mahasiswa         : num [1:2035] 815 591 55 225 476 206 206 403 91 55 ...
##  $ Rasio_Dosen_per_Mahasiswa: num [1:2035] 0.0356 0.0643 0.2545 0.1022 0.0714 ...
##  $ y                        : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...

Splitting Data

set.seed(478)
in.train <- createDataPartition(as.factor(data_sinta$y),p=0.7,list=FALSE) # partition index
data_sinta_train <- data_sinta[in.train,] # training data for model fitting
data_sinta_test<- data_sinta[-in.train,] # testing data for model evaluation

# class proportions of the response variable in each split
round(prop.table(table(data_sinta_train$y)), digits = 4)
## 
##      0      1 
## 0.7707 0.2293
round(prop.table(table(data_sinta_test$y)), digits = 4)
## 
##      0      1 
## 0.7718 0.2282

Logistic Regression

All Predictors

model_reglog_1 <- glm(y~., data_sinta_train, family=binomial())
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model_reglog_1)
## 
## Call:
## glm(formula = y ~ ., family = binomial(), data = data_sinta_train)
## 
## Deviance Residuals: 
##      Min        1Q    Median        3Q       Max  
## -2.05878  -0.43306  -0.15604  -0.06321   2.97138  
## 
## Coefficients:
##                                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -1.902e+01  8.340e+02  -0.023 0.981806    
## Rumpun_IlmuEkonomi             2.162e+00  7.745e-01   2.792 0.005237 ** 
## Rumpun_IlmuKesehatan           4.345e+00  8.479e-01   5.124 2.98e-07 ***
## Rumpun_IlmuMIPA                5.293e+00  7.724e-01   6.852 7.30e-12 ***
## Rumpun_IlmuPendidikan          3.004e+00  8.030e-01   3.741 0.000183 ***
## Rumpun_IlmuSeni,Desain,Media  -1.149e+01  1.601e+03  -0.007 0.994270    
## Rumpun_IlmuTeknik              5.390e+00  7.855e-01   6.862 6.79e-12 ***
## LevelD4                        4.401e-01  1.594e+03   0.000 0.999780    
## LevelS2                        1.546e+01  8.340e+02   0.019 0.985205    
## LevelS3                        1.585e+01  8.340e+02   0.019 0.984835    
## LevelProfesi                   1.240e+01  8.340e+02   0.015 0.988133    
## LevelS1                        1.465e+01  8.340e+02   0.018 0.985985    
## AkreditasiA                   -4.600e-01  6.152e-01  -0.748 0.454685    
## AkreditasiB                    6.867e-01  8.626e-01   0.796 0.426004    
## AkreditasiBaik                -1.372e-01  8.411e-01  -0.163 0.870405    
## AkreditasiBaik Sekali         -9.258e-01  9.386e-01  -0.986 0.323953    
## AkreditasiTidak Terakreditasi -1.713e+00  1.112e+00  -1.540 0.123615    
## AkreditasiUnggul              -7.209e-01  5.653e-01  -1.275 0.202204    
## Jumlah_Dosen_Total             7.843e-03  3.401e-02   0.231 0.817616    
## Jumlah_Mahasiswa              -8.011e-04  1.104e-03  -0.726 0.468101    
## Rasio_Dosen_per_Mahasiswa     -2.117e+00  1.165e+00  -1.817 0.069213 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 898.54  on 920  degrees of freedom
## Residual deviance: 549.08  on 900  degrees of freedom
##   (505 observations deleted due to missingness)
## AIC: 591.08
## 
## Number of Fisher Scoring iterations: 16
# Predictions on the training data
prediksi_prob_data_train <- predict(model_reglog_1, data_sinta_train, type = "response")
prediksi_data_train <- as.factor(ifelse(prediksi_prob_data_train > 0.5,"1","0"))
eval_reglog_1_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
eval_reglog_1_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 702  91
##          1  43  85
##                                           
##                Accuracy : 0.8545          
##                  95% CI : (0.8301, 0.8767)
##     No Information Rate : 0.8089          
##     P-Value [Acc > NIR] : 0.0001675       
##                                           
##                   Kappa : 0.4747          
##                                           
##  Mcnemar's Test P-Value : 4.903e-05       
##                                           
##             Sensitivity : 0.48295         
##             Specificity : 0.94228         
##          Pos Pred Value : 0.66406         
##          Neg Pred Value : 0.88525         
##              Prevalence : 0.19110         
##          Detection Rate : 0.09229         
##    Detection Prevalence : 0.13898         
##       Balanced Accuracy : 0.71262         
##                                           
##        'Positive' Class : 1               
## 

Sensitivity: the model's ability to correctly predict the positive class

Specificity: the model's ability to correctly predict the negative class
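Both rates can be recomputed directly from the confusion matrix printed above (a minimal sketch; cm is the 2x2 table stored in the caret object):

# sensitivity = TP / (TP + FN); specificity = TN / (TN + FP)
cm <- eval_reglog_1_train$table            # rows = predicted class, columns = reference
TP <- cm["1", "1"]; FN <- cm["0", "1"]
TN <- cm["0", "0"]; FP <- cm["1", "0"]
c(Sensitivity = TP / (TP + FN),            # 85 / (85 + 91)  = 0.483
  Specificity = TN / (TN + FP))            # 702 / (702 + 43) = 0.942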

# Predictions on the testing data
prediksi_prob_data_test <- predict(model_reglog_1, data_sinta_test, type = "response")
prediksi_data_test <- as.factor(ifelse(prediksi_prob_data_test > 0.5,"1","0"))
eval_reglog_1 <- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
eval_reglog_1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 300  43
##          1  17  44
##                                          
##                Accuracy : 0.8515         
##                  95% CI : (0.813, 0.8847)
##     No Information Rate : 0.7847         
##     P-Value [Acc > NIR] : 0.000428       
##                                          
##                   Kappa : 0.5071         
##                                          
##  Mcnemar's Test P-Value : 0.001249       
##                                          
##             Sensitivity : 0.5057         
##             Specificity : 0.9464         
##          Pos Pred Value : 0.7213         
##          Neg Pred Value : 0.8746         
##              Prevalence : 0.2153         
##          Detection Rate : 0.1089         
##    Detection Prevalence : 0.1510         
##       Balanced Accuracy : 0.7261         
##                                          
##        'Positive' Class : 1              
## 

Model performance on the training data and the testing data should be compared to detect overfitting or underfitting.

Overfitting occurs when performance on the training data is much higher than performance on the testing data (the model has learned the training data too well, including its noise).

Underfitting occurs when the model fails to learn the structure of the data, so its performance is poor on the training data and, consequently, on the testing data as well.
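As a concrete check for this model, the training and testing accuracies computed above can be placed side by side (a small sketch using the evaluation objects already created):

# compare training vs testing accuracy of the full logistic regression model
round(c(train = unname(eval_reglog_1_train$overall["Accuracy"]),
        test  = unname(eval_reglog_1$overall["Accuracy"])), 4)
# a large gap (train much higher than test) would point to overfitting;
# here the two values, 0.8545 and 0.8515, are close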

# helper function to draw an ROC curve with the AUC in the plot title
rocplot <- function(pred, truth, ...){
  predob <- ROCR::prediction(pred, truth)
  perf   <- ROCR::performance(predob, "tpr", "fpr")
  auc    <- unlist(ROCR::performance(predob, "auc")@y.values)
  plot(perf, main = paste("AUC =", round(auc, 4)), ...)
}

NA.OMIT

#df_authors_UI <- na.omit(prediksi_prob_data_train)

#ROC on the training data
#rocplot(prediksi_prob_data_train,data_sinta_train$y)

data_sinta_test
## # A tibble: 609 x 7
##    Rumpun_Ilmu Level Akreditasi Jumlah_Dosen_Total Jumlah_Mahasiswa
##    <fct>       <fct> <fct>                   <dbl>            <dbl>
##  1 Teknik      S1    Unggul                     16              815
##  2 MIPA        S2    B                           7               55
##  3 <NA>        S1    Unggul                      7              225
##  4 Teknik      S3    Unggul                      7               43
##  5 MIPA        S3    Unggul                      7               34
##  6 Teknik      S2    A                          12              403
##  7 MIPA        S2    A                           6              127
##  8 Teknik      S3    Unggul                      6               87
##  9 Teknik      S3    Unggul                      6               87
## 10 Teknik      S3    Unggul                      6               51
## # ... with 599 more rows, and 2 more variables:
## #   Rasio_Dosen_per_Mahasiswa <dbl>, y <fct>

#ROC on the testing data

rocplot(prediksi_prob_data_test,data_sinta_test$y)

#variable importance
vip(model_reglog_1, num_features = 50)

Variable Selection
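The reduced model below keeps only Level and Jumlah_Mahasiswa. As an illustrative alternative only (not how the variables below were chosen), this kind of reduction could also be automated with backward stepwise selection by AIC:

# illustrative sketch: backward stepwise AIC on the full logistic model
# (na.omit is used so that every candidate model sees the same rows)
model_step <- step(glm(y ~ ., data = na.omit(data_sinta_train), family = binomial()),
                   direction = "backward", trace = FALSE)
formula(model_step)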

model_reglog_2 <- glm(y~Level+Jumlah_Mahasiswa , data_sinta, family=binomial())
summary(model_reglog_2)
## 
## Call:
## glm(formula = y ~ Level + Jumlah_Mahasiswa, family = binomial(), 
##     data = data_sinta)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9858  -0.7647  -0.6192  -0.1815   2.8854  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -3.8822017  1.0087099  -3.849 0.000119 ***
## LevelD4           1.1244597  1.4406433   0.781 0.435081    
## LevelS2           3.3173996  1.0117302   3.279 0.001042 ** 
## LevelS3           3.4221448  1.0161952   3.368 0.000758 ***
## LevelProfesi      2.3077538  1.0636234   2.170 0.030029 *  
## LevelS1           2.9852008  1.0140145   2.944 0.003241 ** 
## Jumlah_Mahasiswa -0.0011125  0.0001943  -5.726 1.03e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2189.9  on 2034  degrees of freedom
## Residual deviance: 2044.6  on 2028  degrees of freedom
## AIC: 2058.6
## 
## Number of Fisher Scoring iterations: 6
# Predictions on the training data
prediksi_prob_data_train <- predict(model_reglog_2, data_sinta_train, type = "response")
prediksi_data_train <- as.factor(ifelse(prediksi_prob_data_train > 0.5,"1","0"))
eval_reglog_2_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
## Warning in confusionMatrix.default(prediksi_data_train, data_sinta_train$y, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
eval_reglog_2_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1099  327
##          1    0    0
##                                          
##                Accuracy : 0.7707         
##                  95% CI : (0.748, 0.7923)
##     No Information Rate : 0.7707         
##     P-Value [Acc > NIR] : 0.5148         
##                                          
##                   Kappa : 0              
##                                          
##  Mcnemar's Test P-Value : <2e-16         
##                                          
##             Sensitivity : 0.0000         
##             Specificity : 1.0000         
##          Pos Pred Value :    NaN         
##          Neg Pred Value : 0.7707         
##              Prevalence : 0.2293         
##          Detection Rate : 0.0000         
##    Detection Prevalence : 0.0000         
##       Balanced Accuracy : 0.5000         
##                                          
##        'Positive' Class : 1              
## 
rocplot(prediksi_prob_data_train,data_sinta_train$y) 

# Predictions on the testing data
prediksi_prob_data_test <- predict(model_reglog_2, data_sinta_test, type = "response")
prediksi_data_test <- as.factor(ifelse(prediksi_prob_data_test > 0.5,"1","0"))
eval_reglog_2 <- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
## Warning in confusionMatrix.default(prediksi_data_test, data_sinta_test$y, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
eval_reglog_2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 470 139
##          1   0   0
##                                           
##                Accuracy : 0.7718          
##                  95% CI : (0.7363, 0.8045)
##     No Information Rate : 0.7718          
##     P-Value [Acc > NIR] : 0.5227          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.7718          
##              Prevalence : 0.2282          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 1               
## 
rocplot(prediksi_prob_data_test,data_sinta_test$y)

vip(model_reglog_2, num_features = 50)

Classification Tree

Model 1 Default

Model with minsplit = 20 (the rpart default) and cp = 0, so the tree is grown without complexity-based pruning (the rpart default cp is 0.01)

model_tree_1 <- rpart(y ~., data = data_sinta_train, method = "class",
               control=rpart.control(minsplit = 20, cp=0))
rpart.plot(model_tree_1, extra = 4)

# Predictions on the training data
prediksi_prob_data_train <- predict(model_tree_1, data_sinta_train, type = "prob")
prediksi_data_train <- predict(model_tree_1, newdata=data_sinta_train, type = "class") 
eval_tree_1_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
eval_tree_1_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1017  111
##          1   82  216
##                                          
##                Accuracy : 0.8647         
##                  95% CI : (0.8458, 0.882)
##     No Information Rate : 0.7707         
##     P-Value [Acc > NIR] : < 2e-16        
##                                          
##                   Kappa : 0.6048         
##                                          
##  Mcnemar's Test P-Value : 0.04385        
##                                          
##             Sensitivity : 0.6606         
##             Specificity : 0.9254         
##          Pos Pred Value : 0.7248         
##          Neg Pred Value : 0.9016         
##              Prevalence : 0.2293         
##          Detection Rate : 0.1515         
##    Detection Prevalence : 0.2090         
##       Balanced Accuracy : 0.7930         
##                                          
##        'Positive' Class : 1              
## 
rocplot(prediksi_prob_data_train[,2],data_sinta_train$y) 

# Predictions on the testing data
prediksi_prob_data_test <- predict(model_tree_1, data_sinta_test, type = "prob")
prediksi_data_test <- predict(model_tree_1, newdata=data_sinta_test, type = "class") 
eval_tree_1 <- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
eval_tree_1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 424  61
##          1  46  78
##                                           
##                Accuracy : 0.8243          
##                  95% CI : (0.7917, 0.8537)
##     No Information Rate : 0.7718          
##     P-Value [Acc > NIR] : 0.0008942       
##                                           
##                   Kappa : 0.4816          
##                                           
##  Mcnemar's Test P-Value : 0.1759180       
##                                           
##             Sensitivity : 0.5612          
##             Specificity : 0.9021          
##          Pos Pred Value : 0.6290          
##          Neg Pred Value : 0.8742          
##              Prevalence : 0.2282          
##          Detection Rate : 0.1281          
##    Detection Prevalence : 0.2036          
##       Balanced Accuracy : 0.7316          
##                                           
##        'Positive' Class : 1               
## 
rocplot(prediksi_prob_data_test[,2],data_sinta_test$y)

vip(model_tree_1, num_features = 50)

Model 2

Model with manually specified minsplit and cp hyperparameters (minsplit = 10, cp = 0)

model_tree_2 <- rpart(y ~., data = data_sinta_train, method = "class",
               control=rpart.control(minsplit = 10, cp=0))
rpart.plot(model_tree_2)

# Predictions on the training data
prediksi_prob_data_train <- predict(model_tree_2, data_sinta_train, type = "prob")
prediksi_data_train <- predict(model_tree_2, newdata=data_sinta_train, type = "class") 
eval_tree_2_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
eval_tree_2_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1027  119
##          1   72  208
##                                           
##                Accuracy : 0.8661          
##                  95% CI : (0.8473, 0.8833)
##     No Information Rate : 0.7707          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.6009          
##                                           
##  Mcnemar's Test P-Value : 0.0008733       
##                                           
##             Sensitivity : 0.6361          
##             Specificity : 0.9345          
##          Pos Pred Value : 0.7429          
##          Neg Pred Value : 0.8962          
##              Prevalence : 0.2293          
##          Detection Rate : 0.1459          
##    Detection Prevalence : 0.1964          
##       Balanced Accuracy : 0.7853          
##                                           
##        'Positive' Class : 1               
## 
ROC_model_tree_2_train <- rocit(score=prediksi_prob_data_train[,2], class=data_sinta_train$y)
plot(ROC_model_tree_2_train)

ROC_model_tree_2_train$AUC
## [1] 0.8940725
# Predictions on the testing data
prediksi_prob_data_test <- predict(model_tree_2, data_sinta_test, type = "prob")
prediksi_data_test <- predict(model_tree_2, newdata=data_sinta_test, type = "class") 
eval_tree_2 <- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
eval_tree_2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 440  66
##          1  30  73
##                                          
##                Accuracy : 0.8424         
##                  95% CI : (0.811, 0.8704)
##     No Information Rate : 0.7718         
##     P-Value [Acc > NIR] : 9.855e-06      
##                                          
##                   Kappa : 0.5076         
##                                          
##  Mcnemar's Test P-Value : 0.000354       
##                                          
##             Sensitivity : 0.5252         
##             Specificity : 0.9362         
##          Pos Pred Value : 0.7087         
##          Neg Pred Value : 0.8696         
##              Prevalence : 0.2282         
##          Detection Rate : 0.1199         
##    Detection Prevalence : 0.1691         
##       Balanced Accuracy : 0.7307         
##                                          
##        'Positive' Class : 1              
## 
ROC_model_tree_2 <- rocit(score=prediksi_prob_data_test[,2], class=data_sinta_test$y)
plot(ROC_model_tree_2)

ROC_model_tree_2$AUC
## [1] 0.8301852
vip(model_tree_2, num_features = 50)

Model 3 Tuning Minsplit

Model with the optimal minsplit hyperparameter, found by the search below

# search for the optimal minsplit
# note: this loop re-partitions data_sinta and overwrites data_sinta_train / data_sinta_test,
# so the models fitted after it use the split from the loop's final iteration
set.seed(478)
akurasi.semua <- NULL

for(ulangan in 1:100){
  acak <- createDataPartition(data_sinta$y, p=0.7, list=FALSE)
  data_sinta_train <- data_sinta[acak,]
  data_sinta_test <- data_sinta[-acak,]

  for (k in 1:30){
  pohon <- rpart(y ~ ., 
                 data=data_sinta_train,
                 method='class',
                 control=rpart.control(minsplit = k, cp=0))
  prediksi.prob <- predict(pohon, data_sinta_test)
  prediksi <- ifelse(prediksi.prob > 0.5, "1", "0")[,2]
  akurasi <- mean(prediksi == data_sinta_test$y)
  akurasi.semua <- rbind(akurasi.semua, c(k, akurasi))
  }
}
mean.akurasi <- tapply(akurasi.semua[,2], akurasi.semua[,1], mean)
plot(names(mean.akurasi),mean.akurasi, type="b", xlab="minsplit", ylab="rata-rata akurasi data testing")

model_tree_3 <- rpart(y ~., data = data_sinta_train, method = "class",
               control=rpart.control(minsplit = 11, cp=0))
rpart.plot(model_tree_3, extra=4)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

# Predictions on the training data
prediksi_prob_data_train <- predict(model_tree_3, data_sinta_train, type = "prob")
prediksi_data_train <- predict(model_tree_3, newdata=data_sinta_train, type = "class") 
eval_tree_3_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
eval_tree_3_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1033  120
##          1   66  207
##                                          
##                Accuracy : 0.8696         
##                  95% CI : (0.851, 0.8866)
##     No Information Rate : 0.7707         
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.6083         
##                                          
##  Mcnemar's Test P-Value : 0.0001018      
##                                          
##             Sensitivity : 0.6330         
##             Specificity : 0.9399         
##          Pos Pred Value : 0.7582         
##          Neg Pred Value : 0.8959         
##              Prevalence : 0.2293         
##          Detection Rate : 0.1452         
##    Detection Prevalence : 0.1914         
##       Balanced Accuracy : 0.7865         
##                                          
##        'Positive' Class : 1              
## 
# Predictions on the testing data
prediksi_prob_data_test <- predict(model_tree_3, data_sinta_test, type = "prob")
prediksi_data_test <- predict(model_tree_3, newdata=data_sinta_test, type = "class") 
eval_tree_3 <- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
eval_tree_3
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 421  63
##          1  49  76
##                                          
##                Accuracy : 0.8161         
##                  95% CI : (0.783, 0.8461)
##     No Information Rate : 0.7718         
##     P-Value [Acc > NIR] : 0.004493       
##                                          
##                   Kappa : 0.4588         
##                                          
##  Mcnemar's Test P-Value : 0.219303       
##                                          
##             Sensitivity : 0.5468         
##             Specificity : 0.8957         
##          Pos Pred Value : 0.6080         
##          Neg Pred Value : 0.8698         
##              Prevalence : 0.2282         
##          Detection Rate : 0.1248         
##    Detection Prevalence : 0.2053         
##       Balanced Accuracy : 0.7213         
##                                          
##        'Positive' Class : 1              
## 
vip(model_tree_3, num_features = 50)

Model 4 Tuning CP

Model with the optimal cp hyperparameter
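A programmatic way to choose cp, shown here only as a sketch (not necessarily how the value 0.0066225 used below was obtained), is to grow an unpruned tree and take the CP row with the smallest cross-validated error from its cptable:

# pick the cp with minimum cross-validated error (xerror) from a fully grown tree
tree_full <- rpart(y ~ ., data = data_sinta_train, method = "class",
                   control = rpart.control(minsplit = 20, cp = 0))
cp_tab <- tree_full$cptable
cp_tab[which.min(cp_tab[, "xerror"]), "CP"]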

set.seed(478)
model_tree_4 <- rpart(y ~ ., data=data_sinta_train,
               method='class',
               control=rpart.control(minsplit = 20, cp=0))
printcp(model_tree_4)
## 
## Classification tree:
## rpart(formula = y ~ ., data = data_sinta_train, method = "class", 
##     control = rpart.control(minsplit = 20, cp = 0))
## 
## Variables actually used in tree construction:
## [1] Akreditasi                Jumlah_Dosen_Total       
## [3] Jumlah_Mahasiswa          Level                    
## [5] Rasio_Dosen_per_Mahasiswa Rumpun_Ilmu              
## 
## Root node error: 327/1426 = 0.22931
## 
## n= 1426 
## 
##          CP nsplit rel error  xerror     xstd
## 1 0.0749235      0   1.00000 1.00000 0.048547
## 2 0.0183486      2   0.85015 0.88073 0.046362
## 3 0.0081549      7   0.75841 0.82875 0.045307
## 4 0.0061162     14   0.67890 0.78593 0.044388
## 5 0.0053517     16   0.66667 0.79205 0.044522
## 6 0.0030581     20   0.64526 0.77982 0.044253
## 7 0.0022936     21   0.64220 0.74924 0.043561
## 8 0.0010194     30   0.60245 0.75229 0.043632
## 9 0.0000000     33   0.59939 0.75229 0.043632
model_tree_4 <- rpart(y ~ ., data=data_sinta_train,
               method='class',
               control=rpart.control(minsplit = 20, cp=0.0066225))
rpart.plot(model_tree_4)

# Predictions on the training data
prediksi_prob_data_train <- predict(model_tree_4, data_sinta_train, type = "prob")
prediksi_data_train <- predict(model_tree_4, newdata=data_sinta_train, type = "class") 
eval_tree_4_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
eval_tree_4_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1037  160
##          1   62  167
##                                           
##                Accuracy : 0.8443          
##                  95% CI : (0.8244, 0.8628)
##     No Information Rate : 0.7707          
##     P-Value [Acc > NIR] : 3.191e-12       
##                                           
##                   Kappa : 0.5077          
##                                           
##  Mcnemar's Test P-Value : 7.504e-11       
##                                           
##             Sensitivity : 0.5107          
##             Specificity : 0.9436          
##          Pos Pred Value : 0.7293          
##          Neg Pred Value : 0.8663          
##              Prevalence : 0.2293          
##          Detection Rate : 0.1171          
##    Detection Prevalence : 0.1606          
##       Balanced Accuracy : 0.7271          
##                                           
##        'Positive' Class : 1               
## 
ROC_model_tree_4_train <- rocit(score=prediksi_prob_data_train[,2], class=data_sinta_train$y)
plot(ROC_model_tree_4_train)

ROC_model_tree_4_train$AUC
## [1] 0.7733511
# Predictions on the testing data
prediksi_prob_data_test <- predict(model_tree_4, data_sinta_test, type = "prob")
prediksi_data_test <- predict(model_tree_4, newdata=data_sinta_test, type = "class") 
eval_tree_4 <- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
eval_tree_4
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 439  76
##          1  31  63
##                                           
##                Accuracy : 0.8243          
##                  95% CI : (0.7917, 0.8537)
##     No Information Rate : 0.7718          
##     P-Value [Acc > NIR] : 0.0008942       
##                                           
##                   Kappa : 0.4371          
##                                           
##  Mcnemar's Test P-Value : 2.103e-05       
##                                           
##             Sensitivity : 0.4532          
##             Specificity : 0.9340          
##          Pos Pred Value : 0.6702          
##          Neg Pred Value : 0.8524          
##              Prevalence : 0.2282          
##          Detection Rate : 0.1034          
##    Detection Prevalence : 0.1544          
##       Balanced Accuracy : 0.6936          
##                                           
##        'Positive' Class : 1               
## 
ROC_model_tree_4 <- rocit(score=prediksi_prob_data_test[,2], class=data_sinta_test$y)
plot(ROC_model_tree_4)

ROC_model_tree_4$AUC
## [1] 0.7244987
vip(model_tree_4, num_features = 50)

Bagging

Model Default

Model with nbagg = 25 (the ipred default number of bootstrap replications) and deep, unpruned trees (minsplit = 2, cp = 0)

model_bag_1 <- ipred::bagging(y ~ ., data=data_sinta_train, coob = TRUE,
                              nbagg=25, 
                              control= rpart.control(minsplit=2, cp=0))
model_bag_1
## 
## Bagging classification trees with 25 bootstrap replications 
## 
## Call: bagging.data.frame(formula = y ~ ., data = data_sinta_train, 
##     coob = TRUE, nbagg = 25, control = rpart.control(minsplit = 2, 
##         cp = 0))
## 
## Out-of-bag estimate of misclassification error:  0.1198
# Predictions on the training data
prediksi_prob_data_train <- predict(model_bag_1, data_sinta_train, type = "prob")
prediksi_data_train <- predict(model_bag_1, data_sinta_train,type="class")
eval_model_bag_1_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
eval_model_bag_1_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 1056  194
##          1   43  133
##                                           
##                Accuracy : 0.8338          
##                  95% CI : (0.8134, 0.8528)
##     No Information Rate : 0.7707          
##     P-Value [Acc > NIR] : 2.586e-09       
##                                           
##                   Kappa : 0.4388          
##                                           
##  Mcnemar's Test P-Value : < 2.2e-16       
##                                           
##             Sensitivity : 0.40673         
##             Specificity : 0.96087         
##          Pos Pred Value : 0.75568         
##          Neg Pred Value : 0.84480         
##              Prevalence : 0.22931         
##          Detection Rate : 0.09327         
##    Detection Prevalence : 0.12342         
##       Balanced Accuracy : 0.68380         
##                                           
##        'Positive' Class : 1               
## 
ROC_model_bag_1_train <- rocit(score=prediksi_prob_data_train[,2], class=data_sinta_train$y)
plot(ROC_model_bag_1_train)

ROC_model_bag_1_train$AUC
## [1] 0.7982667
# Predictions on the testing data
prediksi_prob_data_test <- predict(model_bag_1, data_sinta_test, type = "prob")
prediksi_data_test <- predict(model_bag_1, data_sinta_test,type="class")
eval_model_bag_1<- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
eval_model_bag_1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 444  89
##          1  26  50
##                                           
##                Accuracy : 0.8112          
##                  95% CI : (0.7778, 0.8415)
##     No Information Rate : 0.7718          
##     P-Value [Acc > NIR] : 0.01047         
##                                           
##                   Kappa : 0.3622          
##                                           
##  Mcnemar's Test P-Value : 7.402e-09       
##                                           
##             Sensitivity : 0.3597          
##             Specificity : 0.9447          
##          Pos Pred Value : 0.6579          
##          Neg Pred Value : 0.8330          
##              Prevalence : 0.2282          
##          Detection Rate : 0.0821          
##    Detection Prevalence : 0.1248          
##       Balanced Accuracy : 0.6522          
##                                           
##        'Positive' Class : 1               
## 
ROC_model_bag_1 <- rocit(score=prediksi_prob_data_test[,2], class=data_sinta_test$y)
plot(ROC_model_bag_1)

ROC_model_bag_1$AUC
## [1] 0.7415429
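The same ipred::bagging call pattern used above could be repeated with more bootstrap replications; the variant below is only a sketch (it is not fitted or evaluated in this report):

# illustrative only: bagging with 100 bootstrap replications instead of 25
model_bag_2 <- ipred::bagging(y ~ ., data=data_sinta_train, coob = TRUE,
                              nbagg=100,
                              control= rpart.control(minsplit=2, cp=0))
model_bag_2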

Random Forest

Model 1 Default

Model with the default ntree (500) and mtry hyperparameters
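For reference, randomForest's default mtry for a classification problem is the floor of the square root of the number of predictors; with the six predictors in data_sinta_train this works out to 2 (a quick check):

# default mtry for classification = floor(sqrt(number of predictors))
floor(sqrt(ncol(data_sinta_train) - 1))   # 6 predictors -> mtry = 2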

model_rf_1 <- randomForest::randomForest(y ~ ., ntree=500, data=data_sinta_train)

# Predictions on the training data
prediksi_prob_data_train <- predict(model_rf_1, data_sinta_train, type = "prob")
prediksi_data_train <- predict(model_rf_1, data_sinta_train, type = "class")
eval_model_rf_1_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive = "1")
eval_model_rf_1_train
ROC_model_rf_1_train <- rocit(score = prediksi_prob_data_train[,2], class = data_sinta_train$y)
plot(ROC_model_rf_1_train)
ROC_model_rf_1_train$AUC

# Predictions on the testing data
prediksi_prob_data_test <- predict(model_rf_1, data_sinta_test, type = "prob")
prediksi_data_test <- predict(model_rf_1, data_sinta_test, type = "class")
eval_model_rf_1 <- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive = "1")
eval_model_rf_1
ROC_model_rf_1 <- rocit(score = prediksi_prob_data_test[,2], class = data_sinta_test$y)
plot(ROC_model_rf_1)
ROC_model_rf_1$AUC

vip(model_rf_1, num_features = 50)

Comparison of Model Results

hasil_eval <- rbind(
  c(eval_reglog_1$overall[1], eval_reglog_1$byClass[1], eval_reglog_1$byClass[2]),
  c(eval_reglog_2$overall[1], eval_reglog_2$byClass[1], eval_reglog_2$byClass[2]),
  c(eval_tree_1$overall[1], eval_tree_1$byClass[1], eval_tree_1$byClass[2]),
  c(eval_tree_2$overall[1], eval_tree_2$byClass[1], eval_tree_2$byClass[2]),
  c(eval_tree_3$overall[1], eval_tree_3$byClass[1], eval_tree_3$byClass[2]),
  c(eval_tree_4$overall[1], eval_tree_4$byClass[1], eval_tree_4$byClass[2]),
  c(eval_model_bag_1$overall[1], eval_model_bag_1$byClass[1], eval_model_bag_1$byClass[2]),
  c(eval_model_rf_1$overall[1], eval_model_rf_1$byClass[1], eval_model_rf_1$byClass[2]))
row.names(hasil_eval) <- c("RegLog Semua Peubah", "RegLog Seleksi Peubah",
                           "ClassTree 1", "ClassTree 2", "ClassTree 3", "ClassTree 4",
                           "Bagging 1", "RandomForest 1")
hasil_eval <- as.data.frame(hasil_eval)
dplyr::arrange(.data = hasil_eval, desc(Accuracy))
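Since kableExtra is loaded at the top of the report, the same comparison can also be rendered as a formatted table (an optional sketch):

# optional: nicer rendering of the comparison table, sorted by accuracy
hasil_eval %>%
  dplyr::arrange(desc(Accuracy)) %>%
  knitr::kable(digits = 4) %>%
  kableExtra::kable_styling(full_width = FALSE)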