R04STA1381: Data Analysis Part 1

Library

library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.1.0     v forcats 0.5.1
## Warning: package 'ggplot2' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'purrr' was built under R version 4.1.2
## Warning: package 'stringr' was built under R version 4.1.2
## Warning: package 'forcats' was built under R version 4.1.2
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.1.3
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(ggplot2)
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.1.3
library(stringr)
library(reshape2)
## Warning: package 'reshape2' was built under R version 4.1.2
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
library(mice)
## Warning: package 'mice' was built under R version 4.1.3
## 
## Attaching package: 'mice'
## The following object is masked from 'package:stats':
## 
##     filter
## The following objects are masked from 'package:base':
## 
##     cbind, rbind
library(nortest)
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.1.2
library(caret)
## Warning: package 'caret' was built under R version 4.1.2
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following objects are masked from 'package:DescTools':
## 
##     MAE, RMSE
## The following object is masked from 'package:purrr':
## 
##     lift
library(rpart)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.3
library(ROCit)
## Warning: package 'ROCit' was built under R version 4.1.3
library(PRROC)
## Warning: package 'PRROC' was built under R version 4.1.3
library(ROCR)
## Warning: package 'ROCR' was built under R version 4.1.3
library(vip)
## Warning: package 'vip' was built under R version 4.1.3
## 
## Attaching package: 'vip'
## The following object is masked from 'package:utils':
## 
##     vi
library(pillar)
## 
## Attaching package: 'pillar'
## The following object is masked from 'package:mice':
## 
##     squeeze
## The following object is masked from 'package:dplyr':
## 
##     dim_desc
library(readxl)
## Warning: package 'readxl' was built under R version 4.1.3
library(dplyr)

Data

Authors Unhas

df_authors_ok <- read_xlsx("C:\\Users\\User\\Documents\\KULIAHHHHHHHH\\SMT 5\\PSD\\Data IPB PSD.xlsx")
#df_authors_ok <- df_authors_ok %>%  select(-c(X)) #drop the index column
#data structure before formatting
str(df_authors_ok)
## tibble [1,882 x 30] (S3: tbl_df/tbl/data.frame)
##  $ SINTA_ID                     : num [1:1882] 6004439 6690560 5974526 5975831 21886 ...
##  $ Nama                         : chr [1:1882] "DAHLANG T" "ANDI NILAWATI USMAN" "MOCHAMMAD HATTA" "ANWAR MALLONGI" ...
##  $ Universitas                  : chr [1:1882] "Universitas Hasanuddin" "Universitas Hasanuddin" "Universitas Hasanuddin" "Universitas Hasanuddin" ...
##  $ Kode_Prodi                   : chr [1:1882] "45001" "15101" "11725" "13001" ...
##  $ Departemen                   : chr [1:1882] "S3 - Fisika" "S2 - Ilmu Kebidanan" "Sp-1 - Mikrobiologi Klinik" "S3 - Ilmu Kesehatan Masyarakat" ...
##  $ Jenjang                      : chr [1:1882] "S3" "S2" "Sp-1" "S3" ...
##  $ Prodi                        : chr [1:1882] "Fisika" "Ilmu Kebidanan" "Mikrobiologi Klinik" "Ilmu Kesehatan Masyarakat" ...
##  $ SINTA_Score_Overall          : num [1:1882] 5198 2535 10592 3275 3115 ...
##  $ SINTA_Score_3Yr              : num [1:1882] 2208 2086 2007 1913 1878 ...
##  $ Affil_Score                  : num [1:1882] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Affil_Score_3Yr              : num [1:1882] 0 0 0 0 0 0 0 0 0 0 ...
##  $ Scopus_Artikel               : num [1:1882] 140 67 245 185 73 74 147 133 127 80 ...
##  $ Scopus_Citation              : num [1:1882] 1360 85 2874 836 916 ...
##  $ Scopus_H_Index               : num [1:1882] 20 4 31 17 21 10 13 40 12 16 ...
##  $ GScholar_Artikel             : num [1:1882] 215 132 603 422 116 86 237 240 171 104 ...
##  $ GScholar_Citation            : num [1:1882] 1726 255 5493 1473 949 ...
##  $ GScholar_H_Index             : num [1:1882] 22 8 36 18 20 15 15 33 15 20 ...
##  $ WOS_Artikel                  : num [1:1882] 69 56 198 0 22 35 17 67 18 56 ...
##  $ WOS_Citation                 : num [1:1882] 1067 64 5268 0 127 ...
##  $ WOS_H_Index                  : chr [1:1882] "18" "4" "34" "NA" ...
##  $ Status                       : chr [1:1882] "Aktif" "Aktif" "Aktif" "Aktif" ...
##  $ Akreditasi                   : chr [1:1882] "Belum Terakditasi" "Baik Sekali" "Baik Sekali" "Unggul" ...
##  $ Jumlah_Dosen_Penghitung_Rasio: chr [1:1882] "1" "22" "8" "15" ...
##  $ Jumlah_Dosen_NIDN            : chr [1:1882] "6" "6" "4" "9" ...
##  $ Jumlah_Dosen_NIDK            : chr [1:1882] "0" "0" "0" "0" ...
##  $ Jumlah_Dosen_Total           : chr [1:1882] "6" "6" "4" "9" ...
##  $ Jumlah_Mahasiswa             : chr [1:1882] "2" "218" "24" "152" ...
##  $ Rasio_Dosen_Per_Mahasiswa    : chr [1:1882] "50" "10.0917431192661" "33.3333333333333" "9.8684210526315805" ...
##  $ jumlah_artikel               : num [1:1882] 424 255 1046 607 211 ...
##  $ keterangan                   : chr [1:1882] "Diatas Median" "Diatas Median" "Diatas Median" "Diatas Median" ...

Data Formatting

df_authors_ok$Jenjang <- as.factor(df_authors_ok$Jenjang)
df_authors_ok$Akreditasi <- as.factor(df_authors_ok$Akreditasi)
df_authors_ok$Status <- as.factor(df_authors_ok$Status)
df_authors_ok$keterangan  <- as.factor(df_authors_ok$keterangan)
#data structure after formatting
glimpse(df_authors_ok[,c("Jenjang", "Akreditasi", "Status", "keterangan")])
## Rows: 1,882
## Columns: 4
## $ Jenjang    <fct> S3, S2, Sp-1, S3, S3, S3, Sp-1, S3, S2, S2, S2, S3, S1, S1,~
## $ Akreditasi <fct> Belum Terakditasi, Baik Sekali, Baik Sekali, Unggul, Baik S~
## $ Status     <fct> Aktif, Aktif, Aktif, Aktif, Aktif, Aktif, Aktif, Aktif, Akt~
## $ keterangan <fct> Diatas Median, Diatas Median, Diatas Median, Diatas Median,~

Re-Level Factor

levels(df_authors_ok$Jenjang) #original levels
## [1] "D3"      "D4"      "Profesi" "S1"      "S2"      "S3"      "Sp-1"   
## [8] "Sp-2"    "Unknown"
df_authors_ok$Jenjang <- factor(df_authors_ok$Jenjang,levels(df_authors_ok$Jenjang)[c(9,1,2,4,3,7,5,8,6)]) #re-level
levels(df_authors_ok$Jenjang) #after re-leveling
## [1] "Unknown" "D3"      "D4"      "S1"      "Profesi" "Sp-1"    "S2"     
## [8] "Sp-2"    "S3"
levels(df_authors_ok$Akreditasi)
## [1] "Baik"              "Baik Sekali"       "Belum Terakditasi"
## [4] "NA"                "Unggul"
df_authors_ok$Akreditasi <- factor(df_authors_ok$Akreditasi,levels(df_authors_ok$Akreditasi)[c(4,3,1,2,5)])
levels(df_authors_ok$Akreditasi)
## [1] "NA"                "Belum Terakditasi" "Baik"             
## [4] "Baik Sekali"       "Unggul"

Rumpun Ilmu (Field Cluster) from Kode_Prodi

df_authors_ok$Kode_Prodi
##    [1] "45001" "15101" "11725" "13001" "44002" "54001" "12704" "54041" "11106"
##   [10] "48101" "54145" "13001" "54252" "48201" "47001" "41231" "54145" "13001"
## (output truncated: all 1,882 five-digit Kode_Prodi values, including "NA" strings)
#Build the Rumpun_Ilmu lookup keyed on the 5-digit Kode_Prodi
df_rumpun <- df_authors_ok %>% 
  select(Kode_Prodi,Prodi)  %>% 
  group_by(Kode_Prodi,Prodi) %>%
  summarize() %>% 
  mutate(Kode_Prodi_5Digit = substr(Kode_Prodi,1,5))
## `summarise()` has grouped output by 'Kode_Prodi'. You can override using the `.groups` argument.
view(df_rumpun)
#Assign each 5-digit Kode_Prodi to a Rumpun_Ilmu (field cluster)
df_rumpun <- df_rumpun %>% 
  mutate(Rumpun_Ilmu = case_when(Kode_Prodi_5Digit %in% c(79102,95127,88003,79002,79001,93202,79203,79211,79201,79202,79204,79205,79214,93304,81201,81101,79101,79103) ~ "Ilmu Budaya",
                                 Kode_Prodi_5Digit %in% c(11106,13111,15101,14101,13101,11109,12101,13141,11807,11732,11734,48001,11001,12001,13001,12201,73201,14401,14401,11409,11201,54261,13201,13211,11202,12301,11902,48901,11901,12901,54961,14901,11706,11704,48201,14201,12705,11707,11717,12701,11712,13701,11708,11721,11710,12704,11711,12703,11718,11719,11702,11715,11704,11701,12702,11705,11725,11703,12706,12707,12707,11724,11709,11723,11750,11729,13163,48101) ~ "Kesehatan",
                                 Kode_Prodi_5Digit %in% c(41103,54131,41101,95103,35103,54108,41111,20101,26101,55102,95101,21101,22102,36101,31101,22101,39101,23001,22001,60003,20001,21001,34002,35201,20201,34201,26201,55201,38201,25201,21201,36201,31201,41201,22201,36202,23902,23201,23101) ~ "Teknik",
                                 Kode_Prodi_5Digit %in% c(82101,74101,70101,65101,67101,95130,74102,61107,61106,61101,22106,35101,61102,80101,69101,63001,62001,82001,60001,74001,70001,67001,61001,69001,61201,54245,69201,70201,65201,67201,80201,63201,74235,62901,86207,63201,62201,82201,60201,64201,74201,95105,13161,62101,54142,60101,54142) ~ "Ilmu Sosial dan Politik",
                                 Kode_Prodi_5Digit %in% c(33101,34101,54106,54151,54141,54146,13151,47101,44101,46104,95129,54145,49101,45001,47001,54051,95029,54041,54001,54031,44002,47201,54256,54242,44201,54246,54231,54295,54252,57201,49201,45201,33201,57201,46201,94203,54249,54302,54335,54317,41322,54318,54201,54245,54211,54243,41231,54241,54294,46101,45102,54101,54111) ~ "MIPA",
            
                                 ))
#Rumpun_Ilmu lookup table that will be merged back onto the original data
df_rumpun_oke <- df_rumpun %>% select(Kode_Prodi,Rumpun_Ilmu)

Analysis

#data structure
glimpse(df_authors_ok)
## Rows: 1,882
## Columns: 30
## $ SINTA_ID                      <dbl> 6004439, 6690560, 5974526, 5975831, 2188~
## $ Nama                          <chr> "DAHLANG T", "ANDI NILAWATI USMAN", "MOC~
## $ Universitas                   <chr> "Universitas Hasanuddin", "Universitas H~
## $ Kode_Prodi                    <chr> "45001", "15101", "11725", "13001", "440~
## $ Departemen                    <chr> "S3 - Fisika", "S2 - Ilmu Kebidanan", "S~
## $ Jenjang                       <fct> S3, S2, Sp-1, S3, S3, S3, Sp-1, S3, S2, ~
## $ Prodi                         <chr> "Fisika", "Ilmu Kebidanan", "Mikrobiolog~
## $ SINTA_Score_Overall           <dbl> 5198, 2535, 10592, 3275, 3115, 3054, 258~
## $ SINTA_Score_3Yr               <dbl> 2208, 2086, 2007, 1913, 1878, 1796, 1561~
## $ Affil_Score                   <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ Affil_Score_3Yr               <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ Scopus_Artikel                <dbl> 140, 67, 245, 185, 73, 74, 147, 133, 127~
## $ Scopus_Citation               <dbl> 1360, 85, 2874, 836, 916, 216, 692, 4017~
## $ Scopus_H_Index                <dbl> 20, 4, 31, 17, 21, 10, 13, 40, 12, 16, 1~
## $ GScholar_Artikel              <dbl> 215, 132, 603, 422, 116, 86, 237, 240, 1~
## $ GScholar_Citation             <dbl> 1726, 255, 5493, 1473, 949, 619, 986, 63~
## $ GScholar_H_Index              <dbl> 22, 8, 36, 18, 20, 15, 15, 33, 15, 20, 1~
## $ WOS_Artikel                   <dbl> 69, 56, 198, 0, 22, 35, 17, 67, 18, 56, ~
## $ WOS_Citation                  <dbl> 1067, 64, 5268, 0, 127, 58, 20, 3537, 17~
## $ WOS_H_Index                   <chr> "18", "4", "34", "NA", "7", "5", "3", "2~
## $ Status                        <fct> Aktif, Aktif, Aktif, Aktif, Aktif, Aktif~
## $ Akreditasi                    <fct> Belum Terakditasi, Baik Sekali, Baik Sek~
## $ Jumlah_Dosen_Penghitung_Rasio <chr> "1", "22", "8", "15", "11", "33", "14", ~
## $ Jumlah_Dosen_NIDN             <chr> "6", "6", "4", "9", "5", "12", "6", "6",~
## $ Jumlah_Dosen_NIDK             <chr> "0", "0", "0", "0", "0", "0", "2", "0", ~
## $ Jumlah_Dosen_Total            <chr> "6", "6", "4", "9", "5", "12", "8", "6",~
## $ Jumlah_Mahasiswa              <chr> "2", "218", "24", "152", "9", "124", "14~
## $ Rasio_Dosen_Per_Mahasiswa     <chr> "50", "10.0917431192661", "33.3333333333~
## $ jumlah_artikel                <dbl> 424, 255, 1046, 607, 211, 195, 401, 440,~
## $ keterangan                    <fct> Diatas Median, Diatas Median, Diatas Med~

Data

Observation unit = Authors

y = SINTA_Score_3Yr, categorised into high and low

x1 = Rumpun Ilmu (odd semester 2021)

x2 = Level / Jenjang (odd semester 2021)

x3 = Akreditasi (odd semester 2021)

x4 = Total number of lecturers (odd semester 2021)

x5 = Number of students (odd semester 2021)

x6 = Lecturer-to-student ratio (odd semester 2021)

data_1 <- df_authors_ok %>%  
  left_join(df_rumpun_oke, by="Kode_Prodi") %>%  
  select(SINTA_Score_3Yr,Prodi,Rumpun_Ilmu,Jenjang,Akreditasi,Jumlah_Dosen_Total, Jumlah_Mahasiswa,jumlah_artikel) %>% 
  mutate(y = ifelse(SINTA_Score_3Yr>=205,"1","0")) #class 1: high SINTA_Score_3Yr (>= 205)
data_1$y <- as.factor(data_1$y)
data_1$Rumpun_Ilmu <- as.factor(data_1$Rumpun_Ilmu)
data_1$Jumlah_Dosen_Total <- as.numeric(as.character(data_1$Jumlah_Dosen_Total))
## Warning: NAs introduced by coercion
data_1$Jumlah_Mahasiswa <- as.numeric(as.character(data_1$Jumlah_Mahasiswa))
## Warning: NAs introduced by coercion
str(data_1)
## tibble [2,012 x 9] (S3: tbl_df/tbl/data.frame)
##  $ SINTA_Score_3Yr   : num [1:2012] 2208 2086 2007 1913 1878 ...
##  $ Prodi             : chr [1:2012] "Fisika" "Ilmu Kebidanan" "Mikrobiologi Klinik" "Ilmu Kesehatan Masyarakat" ...
##  $ Rumpun_Ilmu       : Factor w/ 5 levels "Ilmu Budaya",..: 4 3 3 3 4 4 3 4 3 3 ...
##  $ Jenjang           : Factor w/ 9 levels "Unknown","D3",..: 9 7 6 9 9 9 6 9 7 7 ...
##  $ Akreditasi        : Factor w/ 5 levels "NA","Belum Terakditasi",..: 2 4 4 5 4 5 2 4 5 5 ...
##  $ Jumlah_Dosen_Total: num [1:2012] 6 6 4 9 5 12 8 6 6 5 ...
##  $ Jumlah_Mahasiswa  : num [1:2012] 2 218 24 152 9 124 14 56 175 110 ...
##  $ jumlah_artikel    : num [1:2012] 424 255 1046 607 211 ...
##  $ y                 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
#Check for missing values
md.pattern(data_1,rotate.names = TRUE)

##      SINTA_Score_3Yr Prodi Jenjang Akreditasi jumlah_artikel y Jumlah_Dosen_Total Jumlah_Mahasiswa Rumpun_Ilmu    
## 1875               1     1       1          1              1 1                  1                1           1   0
## 32                 1     1       1          1              1 1                  1                1           0   1
## 105                1     1       1          1              1 1                  0                0           0   3
##                    0     0       0          0              0 0                105              105         137 347
data_1 <- data_1 %>% filter(!is.na(Jumlah_Mahasiswa),!is.na(Jumlah_Dosen_Total),!is.na(Rumpun_Ilmu))
View(data_1) #the data that will be used

## EDA of the Data
### Response Variable (y)
#prepare the data needed for the chart
data_chart <- data_1 %>% 
  group_by(y) %>%  
  summarize(value=n()) %>%
  mutate(prop = round(value / sum(value) *100, digits = 2))

#pie chart: distribution of authors by SINTA_Score_3Yr category
ggplot(data_chart, aes(x="", y=prop, fill=y)) +
  geom_bar(stat="identity", width=1, color="white") +
  coord_polar("y", start=0) +
  labs(title= "Proporsi Authors Menurut Kategori SINTA_Score_3Yr",
       subtitle = "Universitas Hasanuddin") +
  theme_void() 

Numeric Predictor Variables (X)

#Histogram of Jumlah_Dosen_Total
ggplot(data_1, aes(x=Jumlah_Dosen_Total)) +
  geom_histogram(fill="#69b3a2", color="#e9ecef", alpha=0.8, bins=15)+
  theme_light() +
  labs(x="Jumlah_Dosen_Total",
       y="Density",
       title= "Sebaran Jumlah_Dosen_Total",
       subtitle = "Universitas Hasanuddin") 

#Histogram of Jumlah_Mahasiswa
ggplot(data_1, aes(x=Jumlah_Mahasiswa)) +
  geom_histogram(fill="#69b3a2", color="#e9ecef", alpha=0.8, bins=15)+
  theme_light() +
  labs(x="Jumlah_Mahasiswa",
       y="Density",
       title= "Sebaran Jumlah_Mahasiswa",
       subtitle = "Universitas Hasanuddin") 

#Histogram of jumlah_artikel
ggplot(data_1, aes(x=jumlah_artikel)) +
  geom_histogram(fill="#69b3a2", color="#e9ecef", alpha=0.8, bins=20)+
  theme_light() +
  labs(x="Jumlah Artikel",
       y="Density",
       title= "Sebaran Jumlah Artikel",
       subtitle = "Universitas Hasanuddin") 

### Categorical Predictor Variables (X)

# Akreditasi
data_bar_chart = data_1 %>%  
  group_by(Akreditasi)%>% 
  summarize(Jumlah=n())

ggplot(data_bar_chart, aes(x=Akreditasi, y=Jumlah)) + 
  geom_bar(stat = "identity",color="steelblue") +
  theme_light() +
  labs(x="",
       y="",
       title= "",
       subtitle = "Universitas Hasanuddin") +
  coord_flip()

# Level
data_bar_chart = data_1 %>%  
  group_by(Jenjang)%>% 
  summarize(Jumlah=n())

ggplot(data_bar_chart, aes(x=Jenjang, y=Jumlah)) + 
  geom_bar(stat = "identity",color="steelblue") +
  theme_light() +
  labs(x="",
       y="",
       title= "",
       subtitle = "Universitas Hasanuddin") +
  coord_flip()

# Rumpun Ilmu
data_bar_chart = data_1 %>%  
  group_by(Rumpun_Ilmu)%>% 
  summarize(Jumlah=n())

ggplot(data_bar_chart, aes(x=(Rumpun_Ilmu), y=Jumlah)) + 
  geom_bar(stat = "identity",color="steelblue") +
  theme_light() +
  labs(x="",
       y="",
       title= "",
       subtitle = "Universitas Hasanuddin") +
  coord_flip()

Relationship between the Predictor Variables and the Response Variable

# Akreditasi & y
percentData <- data_1 %>% 
  group_by(Akreditasi) %>% 
  count(y) %>% 
  mutate(ratio=scales::percent(n/sum(n)))
ggplot(data_1,aes(x=factor(Akreditasi),fill=y,))+
    geom_bar(position="fill")+
    scale_fill_manual(values=c("#7be217", "#4f58ab"))+ 
    geom_text(data=percentData, aes(y=n,label=ratio), color="white",position=position_fill(vjust=0.5))+
    labs( 
       y = "", 
       x = "Akreditasi", 
       subtitle = "Unhas",
       title = "Proporsi Peubah Respon Menurut Akreditasi")

# Level & y
percentData <- data_1 %>% 
  group_by(Jenjang) %>% 
  count(y) %>% 
  mutate(ratio=scales::percent(n/sum(n)))
ggplot(data_1,aes(x=factor(Jenjang),fill=y,))+
    geom_bar(position="fill")+
    scale_fill_manual(values=c("#7be217", "#4f58ab"))+ 
    geom_text(data=percentData, aes(y=n,label=ratio), color="white",position=position_fill(vjust=0.5))+
    labs( 
       y = "", 
       x = "Level", 
       subtitle = "Unhas",
       title = "Proporsi Peubah Respon Menurut Level")

# Rumpun_Ilmu & y
percentData <- data_1 %>% 
  group_by(Rumpun_Ilmu) %>% 
  count(y) %>% 
  mutate(ratio=scales::percent(n/sum(n)))
ggplot(data_1,aes(x=factor(Rumpun_Ilmu),fill=y,))+
    geom_bar(position="fill")+
    scale_fill_manual(values=c("#7be217", "#4f58ab"))+ 
    geom_text(data=percentData, aes(y=n,label=ratio), color="white",position=position_fill(vjust=0.5))+
    labs( 
       y = "", 
       x = "Rumpun_Ilmu", 
       subtitle = "UNP",
       title = "Proporsi Peubah Respon Menurut Rumpun Ilmu")

# Jumlah_Mahasiswa & y

#Boxplot by category
ggplot(data_1, aes(y=y,x=Jumlah_Mahasiswa,fill=Jumlah_Mahasiswa,alpha=Jumlah_Mahasiswa)) + 
  geom_boxplot(fill="#69b3a2",  alpha=0.8) +
  theme_light() +
  labs(x="Jumlah_Mahasiswa",
       y="y",
       title= "Sebaran Jumlah Mahasiswa Menurut Peubah Respon",
       subtitle = "Universitas Hasanuddin") 

# Jumlah_Dosen_Total & y

#Boxplot by category
ggplot(data_1, aes(y=y,x=Jumlah_Dosen_Total,fill=Jumlah_Dosen_Total,alpha=Jumlah_Dosen_Total)) + 
  geom_boxplot(fill="#69b3a2",  alpha=0.8) +
  theme_light() +
  labs(x="Jumlah_Dosen_Total",
       y="y",
       title= "Sebaran Jumlah Dosen Total Menurut Peubah Respon",
       subtitle = "Universitas Hasanuddin") 

# jumlah_artikel & y

#Boxplot by category
ggplot(data_1, aes(y=y,x=jumlah_artikel,fill=jumlah_artikel,alpha=jumlah_artikel)) + 
  geom_boxplot(fill="#69b3a2",  alpha=0.8) +
  theme_light() +
  labs(x="Jumlah_Artikel",
       y="y",
       title= "Sebaran Jumlah Artikel Menurut Peubah Respon",
       subtitle = "Universitas Hasanuddin") 

Data Model

#data that will be used for the models
data_sinta <- data_1 %>% select(-c(SINTA_Score_3Yr,Prodi))
str(data_sinta)
## tibble [1,875 x 7] (S3: tbl_df/tbl/data.frame)
##  $ Rumpun_Ilmu       : Factor w/ 5 levels "Ilmu Budaya",..: 4 3 3 3 4 4 3 4 3 3 ...
##  $ Jenjang           : Factor w/ 9 levels "Unknown","D3",..: 9 7 6 9 9 9 6 9 7 7 ...
##  $ Akreditasi        : Factor w/ 5 levels "NA","Belum Terakditasi",..: 2 4 4 5 4 5 2 4 5 5 ...
##  $ Jumlah_Dosen_Total: num [1:1875] 6 6 4 9 5 12 8 6 6 5 ...
##  $ Jumlah_Mahasiswa  : num [1:1875] 2 218 24 152 9 124 14 56 175 110 ...
##  $ jumlah_artikel    : num [1:1875] 424 255 1046 607 211 ...
##  $ y                 : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...

Splitting Data

set.seed(478)
in.train <- createDataPartition(as.factor(data_sinta$y),p=0.7,list=F) #partition the data (70% training)
data_sinta_train <- data_sinta[in.train,] #training data for modelling
data_sinta_test<- data_sinta[-in.train,] #testing data for model evaluation

#class proportions of the response variable in each split
round(prop.table(table(data_sinta_train$y)), digits = 4)
## 
##      0      1 
## 0.7443 0.2557
round(prop.table(table(data_sinta_test$y)), digits = 4)
## 
##      0      1 
## 0.7451 0.2549

Logistic Regression

All Predictors

model_reglog_1 <- glm(y~., data_sinta_train, family=binomial())
summary(model_reglog_1)
## 
## Call:
## glm(formula = y ~ ., family = binomial(), data = data_sinta_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.5488  -0.5079  -0.3080   0.0120   2.6593  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                        -6.4375896  1.2829283  -5.018 5.22e-07 ***
## Rumpun_IlmuIlmu Sosial dan Politik  0.1966512  0.6426449   0.306 0.759602    
## Rumpun_IlmuKesehatan                1.7174950  0.6493347   2.645 0.008169 ** 
## Rumpun_IlmuMIPA                     2.1701394  0.6149380   3.529 0.000417 ***
## Rumpun_IlmuTeknik                   1.9467967  0.6205277   3.137 0.001705 ** 
## JenjangD3                           2.8227428  1.6993237   1.661 0.096694 .  
## JenjangD4                           2.6438339  1.2993506   2.035 0.041878 *  
## JenjangS1                           1.7074280  1.0887359   1.568 0.116819    
## JenjangProfesi                      1.0543093  1.2711730   0.829 0.406879    
## JenjangSp-1                         1.4468122  1.1564136   1.251 0.210891    
## JenjangS2                           2.6475805  1.0909377   2.427 0.015229 *  
## JenjangSp-2                         3.1776611  1.7940280   1.771 0.076520 .  
## JenjangS3                           2.7677093  1.1224746   2.466 0.013674 *  
## AkreditasiBaik                      0.0409923  0.4505642   0.091 0.927509    
## AkreditasiBaik Sekali               0.0789481  0.3736678   0.211 0.832670    
## AkreditasiUnggul                    0.1273243  0.3110881   0.409 0.682329    
## Jumlah_Dosen_Total                 -0.0171818  0.0107885  -1.593 0.111249    
## Jumlah_Mahasiswa                    0.0007409  0.0003917   1.891 0.058580 .  
## jumlah_artikel                      0.0278937  0.0021611  12.907  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1494.07  on 1313  degrees of freedom
## Residual deviance:  914.32  on 1295  degrees of freedom
## AIC: 952.32
## 
## Number of Fisher Scoring iterations: 6
# Predictions on the Training Data
prediksi_prob_data_train <- predict(model_reglog_1, data_sinta_train, type = "response")
prediksi_data_train <- as.factor(ifelse(prediksi_prob_data_train > 0.5,"1","0"))
eval_reglog_1_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
eval_reglog_1_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 926 140
##          1  52 196
##                                           
##                Accuracy : 0.8539          
##                  95% CI : (0.8336, 0.8726)
##     No Information Rate : 0.7443          
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.58            
##                                           
##  Mcnemar's Test P-Value : 3.415e-10       
##                                           
##             Sensitivity : 0.5833          
##             Specificity : 0.9468          
##          Pos Pred Value : 0.7903          
##          Neg Pred Value : 0.8687          
##              Prevalence : 0.2557          
##          Detection Rate : 0.1492          
##    Detection Prevalence : 0.1887          
##       Balanced Accuracy : 0.7651          
##                                           
##        'Positive' Class : 1               
## 

Sensitivity: the model's ability to correctly identify the positive class

Specificity: the model's ability to correctly identify the negative class
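
These two rates can be read straight off the confusion matrix. A minimal sketch, assuming the eval_reglog_1_train object produced above:

#recompute sensitivity and specificity by hand from the confusion matrix
cm <- eval_reglog_1_train$table        #rows = Prediction, columns = Reference
TP <- cm["1","1"]; FN <- cm["0","1"]   #true positives and false negatives
TN <- cm["0","0"]; FP <- cm["1","0"]   #true negatives and false positives
TP/(TP+FN) #sensitivity = 196/(196+140) = 0.5833
TN/(TN+FP) #specificity = 926/(926+52) = 0.9468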

# Predictions on the Testing Data
prediksi_prob_data_test <- predict(model_reglog_1, data_sinta_test, type = "response")
prediksi_data_test <- as.factor(ifelse(prediksi_prob_data_test > 0.5,"1","0"))
eval_reglog_1 <- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
eval_reglog_1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 389  48
##          1  29  95
##                                           
##                Accuracy : 0.8627          
##                  95% CI : (0.8315, 0.8901)
##     No Information Rate : 0.7451          
##     P-Value [Acc > NIR] : 6.93e-12        
##                                           
##                   Kappa : 0.6221          
##                                           
##  Mcnemar's Test P-Value : 0.04024         
##                                           
##             Sensitivity : 0.6643          
##             Specificity : 0.9306          
##          Pos Pred Value : 0.7661          
##          Neg Pred Value : 0.8902          
##              Prevalence : 0.2549          
##          Detection Rate : 0.1693          
##    Detection Prevalence : 0.2210          
##       Balanced Accuracy : 0.7975          
##                                           
##        'Positive' Class : 1               
## 

Model performance on the training data should be compared with performance on the testing data to check for overfitting or underfitting.

Overfitting occurs when performance on the training data is much higher than performance on the testing data (the model has learned the training data too well, including its noise).

Underfitting occurs when the model fails to learn the structure of the data, so performance is poor even on the training data (and typically no better on the testing data).
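
A quick side-by-side check, as a minimal sketch that reuses the eval_reglog_1_train and eval_reglog_1 objects above:

#compare overall accuracy on the training and testing sets;
#a training accuracy far above the testing accuracy would suggest overfitting
round(c(train = eval_reglog_1_train$overall["Accuracy"],
        test  = eval_reglog_1$overall["Accuracy"]), 4)
#here the two values are close (about 0.85 vs 0.86), so neither problem is apparent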

#helper function to draw a ROC curve with the AUC shown as the plot title
rocplot=function(pred,truth, ...){
  predob=ROCR::prediction(pred,truth)          #prediction object from scores and true labels
  perf=ROCR::performance(predob,"tpr","fpr")   #TPR vs FPR pairs for the ROC curve
  auc=ROCR::performance(predob,"auc")@y.values #area under the curve
  plot(perf,main = auc)
}
#ROC curve on the training data
rocplot(prediksi_prob_data_train,data_sinta_train$y) 

#ROC curve on the testing data
rocplot(prediksi_prob_data_test,data_sinta_test$y)

#variable importance
vip(model_reglog_1, num_features = 50)

Variable Selection

model_reglog_2 <- glm(y~Jenjang+Jumlah_Mahasiswa , data_sinta, family=binomial())
summary(model_reglog_2)
## 
## Call:
## glm(formula = y ~ Jenjang + Jumlah_Mahasiswa, family = binomial(), 
##     data = data_sinta)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1283  -0.6862  -0.6486   1.2289   2.4910  
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      -3.0284969  0.7239119  -4.184 2.87e-05 ***
## JenjangD3         0.7565670  1.2739870   0.594 0.552607    
## JenjangD4         2.2090085  0.8538768   2.587 0.009681 ** 
## JenjangS1         1.7220577  0.7308360   2.356 0.018459 *  
## JenjangProfesi    0.5643365  0.8903418   0.634 0.526184    
## JenjangSp-1       1.7218649  0.7541232   2.283 0.022415 *  
## JenjangS2         2.6014889  0.7301575   3.563 0.000367 ***
## JenjangSp-2       1.9318995  1.0911585   1.771 0.076643 .  
## JenjangS3         2.9123542  0.7376778   3.948 7.88e-05 ***
## Jumlah_Mahasiswa -0.0002643  0.0001638  -1.614 0.106591    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2131.0  on 1874  degrees of freedom
## Residual deviance: 1994.8  on 1865  degrees of freedom
## AIC: 2014.8
## 
## Number of Fisher Scoring iterations: 5
# Predictions on the Training Data
prediksi_prob_data_train <- predict(model_reglog_2, data_sinta_train, type = "response")
prediksi_data_train <- as.factor(ifelse(prediksi_prob_data_train > 0.5,"1","0"))
eval_reglog_2_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
## Warning in confusionMatrix.default(prediksi_data_train, data_sinta_train$y, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
eval_reglog_2_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 978 336
##          1   0   0
##                                           
##                Accuracy : 0.7443          
##                  95% CI : (0.7198, 0.7677)
##     No Information Rate : 0.7443          
##     P-Value [Acc > NIR] : 0.5147          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.7443          
##              Prevalence : 0.2557          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 1               
## 
rocplot(prediksi_prob_data_train,data_sinta_train$y) 

# Predictions on the Testing Data
prediksi_prob_data_test <- predict(model_reglog_2, data_sinta_test, type = "response")
prediksi_data_test <- as.factor(ifelse(prediksi_prob_data_test > 0.5,"1","0"))
eval_reglog_2 <- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
## Warning in confusionMatrix.default(prediksi_data_test, data_sinta_test$y, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
eval_reglog_2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 418 143
##          1   0   0
##                                           
##                Accuracy : 0.7451          
##                  95% CI : (0.7069, 0.7807)
##     No Information Rate : 0.7451          
##     P-Value [Acc > NIR] : 0.5225          
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0000          
##             Specificity : 1.0000          
##          Pos Pred Value :    NaN          
##          Neg Pred Value : 0.7451          
##              Prevalence : 0.2549          
##          Detection Rate : 0.0000          
##    Detection Prevalence : 0.0000          
##       Balanced Accuracy : 0.5000          
##                                           
##        'Positive' Class : 1               
## 
rocplot(prediksi_prob_data_test,data_sinta_test$y)

vip(model_reglog_2, num_features = 50)

Classification Tree

Model 1 Default

Model with the default minsplit (20) and cp set to 0

model_tree_1 <- rpart(y ~., data = data_sinta_train, method = "class",
               control=rpart.control(minsplit = 20, cp=0))
rpart.plot(model_tree_1, extra = 4)

# Predictions on the Training Data
prediksi_prob_data_train <- predict(model_tree_1, data_sinta_train, type = "prob")
prediksi_data_train <- predict(model_tree_1, newdata=data_sinta_train, type = "class") 
eval_tree_1_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
eval_tree_1_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 906  81
##          1  72 255
##                                          
##                Accuracy : 0.8836         
##                  95% CI : (0.865, 0.9004)
##     No Information Rate : 0.7443         
##     P-Value [Acc > NIR] : <2e-16         
##                                          
##                   Kappa : 0.6914         
##                                          
##  Mcnemar's Test P-Value : 0.5178         
##                                          
##             Sensitivity : 0.7589         
##             Specificity : 0.9264         
##          Pos Pred Value : 0.7798         
##          Neg Pred Value : 0.9179         
##              Prevalence : 0.2557         
##          Detection Rate : 0.1941         
##    Detection Prevalence : 0.2489         
##       Balanced Accuracy : 0.8427         
##                                          
##        'Positive' Class : 1              
## 
rocplot(prediksi_prob_data_train[,2],data_sinta_train$y) 

# Predictions on the Testing Data
prediksi_prob_data_test <- predict(model_tree_1, data_sinta_test, type = "prob")
prediksi_data_test <- predict(model_tree_1, newdata=data_sinta_test, type = "class") 
eval_tree_1 <- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
eval_tree_1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 369  55
##          1  49  88
##                                           
##                Accuracy : 0.8146          
##                  95% CI : (0.7799, 0.8459)
##     No Information Rate : 0.7451          
##     P-Value [Acc > NIR] : 5.915e-05       
##                                           
##                   Kappa : 0.5051          
##                                           
##  Mcnemar's Test P-Value : 0.6239          
##                                           
##             Sensitivity : 0.6154          
##             Specificity : 0.8828          
##          Pos Pred Value : 0.6423          
##          Neg Pred Value : 0.8703          
##              Prevalence : 0.2549          
##          Detection Rate : 0.1569          
##    Detection Prevalence : 0.2442          
##       Balanced Accuracy : 0.7491          
##                                           
##        'Positive' Class : 1               
## 
rocplot(prediksi_prob_data_test[,2],data_sinta_test$y)

vip(model_tree_1, num_features = 50)

Model 2

Model with manually chosen hyperparameters (minsplit = 10 and cp = 0)

model_tree_2 <- rpart(y ~., data = data_sinta_train, method = "class",
               control=rpart.control(minsplit = 10, cp=0))
rpart.plot(model_tree_2)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

# Predictions on the Training Data
prediksi_prob_data_train <- predict(model_tree_2, data_sinta_train, type = "prob")
prediksi_data_train <- predict(model_tree_2, newdata=data_sinta_train, type = "class") 
eval_tree_2_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
eval_tree_2_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 926  59
##          1  52 277
##                                         
##                Accuracy : 0.9155        
##                  95% CI : (0.8992, 0.93)
##     No Information Rate : 0.7443        
##     P-Value [Acc > NIR] : <2e-16        
##                                         
##                   Kappa : 0.7765        
##                                         
##  Mcnemar's Test P-Value : 0.569         
##                                         
##             Sensitivity : 0.8244        
##             Specificity : 0.9468        
##          Pos Pred Value : 0.8419        
##          Neg Pred Value : 0.9401        
##              Prevalence : 0.2557        
##          Detection Rate : 0.2108        
##    Detection Prevalence : 0.2504        
##       Balanced Accuracy : 0.8856        
##                                         
##        'Positive' Class : 1             
## 
ROC_model_tree_2_train <- rocit(score=prediksi_prob_data_train[,2], class=data_sinta_train$y)
plot(ROC_model_tree_2_train)

ROC_model_tree_2_train$AUC
## [1] 0.9640179
# Predictions on the Testing Data
prediksi_prob_data_test <- predict(model_tree_2, data_sinta_test, type = "prob")
prediksi_data_test <- predict(model_tree_2, newdata=data_sinta_test, type = "class") 
eval_tree_2 <- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
eval_tree_2
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 357  54
##          1  61  89
##                                           
##                Accuracy : 0.795           
##                  95% CI : (0.7592, 0.8277)
##     No Information Rate : 0.7451          
##     P-Value [Acc > NIR] : 0.003282        
##                                           
##                   Kappa : 0.4689          
##                                           
##  Mcnemar's Test P-Value : 0.575819        
##                                           
##             Sensitivity : 0.6224          
##             Specificity : 0.8541          
##          Pos Pred Value : 0.5933          
##          Neg Pred Value : 0.8686          
##              Prevalence : 0.2549          
##          Detection Rate : 0.1586          
##    Detection Prevalence : 0.2674          
##       Balanced Accuracy : 0.7382          
##                                           
##        'Positive' Class : 1               
## 
ROC_model_tree_2 <- rocit(score=prediksi_prob_data_test[,2], class=data_sinta_test$y)
plot(ROC_model_tree_2)

ROC_model_tree_2$AUC
## [1] 0.8494914
vip(model_tree_2, num_features = 50)

Model 3 Tuning Minsplit

Model with the optimum minsplit hyperparameter

#search for the optimum minsplit
set.seed(478)
akurasi.semua <- NULL

for(ulangan in 1:100){
  acak <- createDataPartition(data_sinta$y, p=0.7, list=FALSE)
  data_sinta_train <- data_sinta[acak,]
  data_sinta_test <- data_sinta[-acak,]

  for (k in 1:30){
  pohon <- rpart(y ~ ., 
                 data=data_sinta_train,
                 method='class',
                 control=rpart.control(minsplit = k, cp=0))
  prediksi.prob <- predict(pohon, data_sinta_test)
  prediksi <- ifelse(prediksi.prob > 0.5, "1", "0")[,2]
  akurasi <- mean(prediksi == data_sinta_test$y)
  akurasi.semua <- rbind(akurasi.semua, c(k, akurasi))
  }
}
mean.akurasi <- tapply(akurasi.semua[,2], akurasi.semua[,1], mean)
plot(names(mean.akurasi),mean.akurasi, type="b", xlab="minsplit", ylab="rata-rata akurasi data testing")

model_tree_3 <- rpart(y ~., data = data_sinta_train, method = "class",
               control=rpart.control(minsplit = 11, cp=0))
rpart.plot(model_tree_3, extra=4)
## Warning: labs do not fit even at cex 0.15, there may be some overplotting

# Predictions on the Training Data
prediksi_prob_data_train <- predict(model_tree_3, data_sinta_train, type = "prob")
prediksi_data_train <- predict(model_tree_3, newdata=data_sinta_train, type = "class") 
eval_tree_3_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
eval_tree_3_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 938  61
##          1  40 275
##                                          
##                Accuracy : 0.9231         
##                  95% CI : (0.9074, 0.937)
##     No Information Rate : 0.7443         
##     P-Value [Acc > NIR] : < 2e-16        
##                                          
##                   Kappa : 0.7938         
##                                          
##  Mcnemar's Test P-Value : 0.04658        
##                                          
##             Sensitivity : 0.8185         
##             Specificity : 0.9591         
##          Pos Pred Value : 0.8730         
##          Neg Pred Value : 0.9389         
##              Prevalence : 0.2557         
##          Detection Rate : 0.2093         
##    Detection Prevalence : 0.2397         
##       Balanced Accuracy : 0.8888         
##                                          
##        'Positive' Class : 1              
## 
# Predictions on the test data
prediksi_prob_data_test <- predict(model_tree_3, data_sinta_test, type = "prob")
prediksi_data_test <- predict(model_tree_3, newdata=data_sinta_test, type = "class") 
eval_tree_3 <- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
eval_tree_3
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 369  60
##          1  49  83
##                                           
##                Accuracy : 0.8057          
##                  95% CI : (0.7705, 0.8377)
##     No Information Rate : 0.7451          
##     P-Value [Acc > NIR] : 0.0004311       
##                                           
##                   Kappa : 0.4752          
##                                           
##  Mcnemar's Test P-Value : 0.3381504       
##                                           
##             Sensitivity : 0.5804          
##             Specificity : 0.8828          
##          Pos Pred Value : 0.6288          
##          Neg Pred Value : 0.8601          
##              Prevalence : 0.2549          
##          Detection Rate : 0.1480          
##    Detection Prevalence : 0.2353          
##       Balanced Accuracy : 0.7316          
##                                           
##        'Positive' Class : 1               
## 
vip(model_tree_3, num_features = 50)

Model 4 Tuning CP

Model with the optimum cp hyperparameter

set.seed(478)
model_tree_4 <- rpart(y ~ ., data=data_sinta_train,
               method='class',
               control=rpart.control(minsplit = 20, cp=0))
printcp(model_tree_4)
## 
## Classification tree:
## rpart(formula = y ~ ., data = data_sinta_train, method = "class", 
##     control = rpart.control(minsplit = 20, cp = 0))
## 
## Variables actually used in tree construction:
## [1] Akreditasi         Jenjang            jumlah_artikel     Jumlah_Dosen_Total
## [5] Jumlah_Mahasiswa   Rumpun_Ilmu       
## 
## Root node error: 336/1314 = 0.25571
## 
## n= 1314 
## 
##            CP nsplit rel error  xerror     xstd
## 1  0.39285714      0   1.00000 1.00000 0.047065
## 2  0.10714286      1   0.60714 0.61607 0.039303
## 3  0.00793651      2   0.50000 0.51488 0.036478
## 4  0.00595238      5   0.47619 0.56250 0.037859
## 5  0.00496032     12   0.43155 0.55655 0.037692
## 6  0.00396825     15   0.41667 0.55655 0.037692
## 7  0.00297619     18   0.40476 0.54762 0.037438
## 8  0.00148810     20   0.39881 0.59226 0.038675
## 9  0.00099206     24   0.39286 0.59226 0.038675
## 10 0.00000000     27   0.38988 0.59821 0.038834
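
A common pruning rule is to take the CP row with the smallest cross-validated error (xerror), which can be pulled from the cptable directly rather than read off the printout. A minimal sketch (cp_opt is a helper name, not part of the original code); the refit below instead uses a manually chosen cp = 0.0066225:

# CP value of the row with the lowest cross-validated error
cp_opt <- model_tree_4$cptable[which.min(model_tree_4$cptable[, "xerror"]), "CP"]
cp_opt
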
model_tree_4 <- rpart(y ~ ., data=data_sinta_train,
               method='class',
               control=rpart.control(minsplit = 20, cp=0.0066225))
rpart.plot(model_tree_4)

# Predictions on the training data
prediksi_prob_data_train <- predict(model_tree_4, data_sinta_train, type = "prob")
prediksi_data_train <- predict(model_tree_4, newdata=data_sinta_train, type = "class") 
eval_tree_4_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
eval_tree_4_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 894  76
##          1  84 260
##                                           
##                Accuracy : 0.8782          
##                  95% CI : (0.8593, 0.8954)
##     No Information Rate : 0.7443          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.6826          
##                                           
##  Mcnemar's Test P-Value : 0.58            
##                                           
##             Sensitivity : 0.7738          
##             Specificity : 0.9141          
##          Pos Pred Value : 0.7558          
##          Neg Pred Value : 0.9216          
##              Prevalence : 0.2557          
##          Detection Rate : 0.1979          
##    Detection Prevalence : 0.2618          
##       Balanced Accuracy : 0.8440          
##                                           
##        'Positive' Class : 1               
## 
ROC_model_tree_4_train <- rocit(score=prediksi_prob_data_train[,2], class=data_sinta_train$y)
plot(ROC_model_tree_4_train)

ROC_model_tree_4_train$AUC
## [1] 0.8952399
# Predictions on the test data
prediksi_prob_data_test <- predict(model_tree_4, data_sinta_test, type = "prob")
prediksi_data_test <- predict(model_tree_4, newdata=data_sinta_test, type = "class") 
eval_tree_4 <- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
eval_tree_4
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 371  45
##          1  47  98
##                                           
##                Accuracy : 0.836           
##                  95% CI : (0.8027, 0.8657)
##     No Information Rate : 0.7451          
##     P-Value [Acc > NIR] : 1.553e-07       
##                                           
##                   Kappa : 0.5703          
##                                           
##  Mcnemar's Test P-Value : 0.917           
##                                           
##             Sensitivity : 0.6853          
##             Specificity : 0.8876          
##          Pos Pred Value : 0.6759          
##          Neg Pred Value : 0.8918          
##              Prevalence : 0.2549          
##          Detection Rate : 0.1747          
##    Detection Prevalence : 0.2585          
##       Balanced Accuracy : 0.7864          
##                                           
##        'Positive' Class : 1               
## 
ROC_model_tree_4 <- rocit(score=prediksi_prob_data_test[,2], class=data_sinta_test$y)
plot(ROC_model_tree_4)

ROC_model_tree_4$AUC
## [1] 0.8122511
vip(model_tree_4, num_features = 50)

Bagging

Default Model

Model with the default nbagg = 25 and the default fully grown trees (minsplit = 2, cp = 0)

model_bag_1 <- ipred::bagging(y ~ ., data=data_sinta_train, coob = TRUE,
                              nbagg=25, 
                              control= rpart.control(minsplit=2, cp=0))
model_bag_1
## 
## Bagging classification trees with 25 bootstrap replications 
## 
## Call: bagging.data.frame(formula = y ~ ., data = data_sinta_train, 
##     coob = TRUE, nbagg = 25, control = rpart.control(minsplit = 2, 
##         cp = 0))
## 
## Out-of-bag estimate of misclassification error:  0.1461
# Predictions on the training data
prediksi_prob_data_train <- predict(model_bag_1, data_sinta_train, type = "prob")
prediksi_data_train <- predict(model_bag_1, data_sinta_train,type="class")
eval_model_bag_1_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
eval_model_bag_1_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 975   1
##          1   3 335
##                                           
##                Accuracy : 0.997           
##                  95% CI : (0.9922, 0.9992)
##     No Information Rate : 0.7443          
##     P-Value [Acc > NIR] : <2e-16          
##                                           
##                   Kappa : 0.992           
##                                           
##  Mcnemar's Test P-Value : 0.6171          
##                                           
##             Sensitivity : 0.9970          
##             Specificity : 0.9969          
##          Pos Pred Value : 0.9911          
##          Neg Pred Value : 0.9990          
##              Prevalence : 0.2557          
##          Detection Rate : 0.2549          
##    Detection Prevalence : 0.2572          
##       Balanced Accuracy : 0.9970          
##                                           
##        'Positive' Class : 1               
## 
ROC_model_bag_1_train <- rocit(score=prediksi_prob_data_train[,2], class=data_sinta_train$y)
plot(ROC_model_bag_1_train)

ROC_model_bag_1_train$AUC
## [1] 0.9998783
# Predictions on the test data
prediksi_prob_data_test <- predict(model_bag_1, data_sinta_test, type = "prob")
prediksi_data_test <- predict(model_bag_1, data_sinta_test,type="class")
eval_model_bag_1<- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
eval_model_bag_1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 368  66
##          1  50  77
##                                          
##                Accuracy : 0.7932         
##                  95% CI : (0.7573, 0.826)
##     No Information Rate : 0.7451         
##     P-Value [Acc > NIR] : 0.00444        
##                                          
##                   Kappa : 0.4348         
##                                          
##  Mcnemar's Test P-Value : 0.16371        
##                                          
##             Sensitivity : 0.5385         
##             Specificity : 0.8804         
##          Pos Pred Value : 0.6063         
##          Neg Pred Value : 0.8479         
##              Prevalence : 0.2549         
##          Detection Rate : 0.1373         
##    Detection Prevalence : 0.2264         
##       Balanced Accuracy : 0.7094         
##                                          
##        'Positive' Class : 1              
## 
ROC_model_bag_1 <- rocit(score=prediksi_prob_data_test[,2], class=data_sinta_test$y)
plot(ROC_model_bag_1)

ROC_model_bag_1$AUC
## [1] 0.8295747
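
Only the default nbagg = 25 is evaluated here. Because ipred::bagging() already reports an out-of-bag error when coob = TRUE, that estimate gives a cheap way to check whether more bootstrap replications would help. A minimal sketch, not part of the original analysis (nbagg_grid and err_oob are helper names):

# OOB misclassification error for several numbers of bootstrap replications
nbagg_grid <- c(25, 50, 100, 200)
err_oob <- sapply(nbagg_grid, function(b) {
  m <- ipred::bagging(y ~ ., data = data_sinta_train, coob = TRUE, nbagg = b,
                      control = rpart.control(minsplit = 2, cp = 0))
  m$err   # OOB estimate stored when coob = TRUE
})
plot(nbagg_grid, err_oob, type = "b", xlab = "nbagg", ylab = "OOB error")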

Random Forest

Model 1 Default

Model with the default ntree and mtry hyperparameters
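
For classification, randomForest() falls back to mtry = floor(sqrt(p)) candidate predictors per split when mtry is not set. A quick check of that default for this data (a sketch; p is just a helper name, not part of the original code):

p <- ncol(data_sinta_train) - 1   # number of predictors, excluding the response y
floor(sqrt(p))                    # default mtry used by randomForest for classification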

model_rf_1 <- randomForest::randomForest(y ~ ., ntree=500,
                                         data=data_sinta_train)
# Predictions on the training data
prediksi_prob_data_train <- predict(model_rf_1, data_sinta_train, type = "prob")
prediksi_data_train <- predict(model_rf_1, data_sinta_train,type="class")
eval_model_rf_1_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
eval_model_rf_1_train
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 969  21
##          1   9 315
##                                           
##                Accuracy : 0.9772          
##                  95% CI : (0.9676, 0.9845)
##     No Information Rate : 0.7443          
##     P-Value [Acc > NIR] : < 2e-16         
##                                           
##                   Kappa : 0.9393          
##                                           
##  Mcnemar's Test P-Value : 0.04461         
##                                           
##             Sensitivity : 0.9375          
##             Specificity : 0.9908          
##          Pos Pred Value : 0.9722          
##          Neg Pred Value : 0.9788          
##              Prevalence : 0.2557          
##          Detection Rate : 0.2397          
##    Detection Prevalence : 0.2466          
##       Balanced Accuracy : 0.9641          
##                                           
##        'Positive' Class : 1               
## 
ROC_model_rf_1_train <- rocit(score=prediksi_prob_data_train[,2], class=data_sinta_train$y)
plot(ROC_model_rf_1_train)

ROC_model_rf_1_train$AUC
## [1] 0.9977907
# Predictions on the test data
prediksi_prob_data_test <- predict(model_rf_1, data_sinta_test, type = "prob")
prediksi_data_test <- predict(model_rf_1, data_sinta_test,type="class")
eval_model_rf_1<- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
eval_model_rf_1
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 377  61
##          1  41  82
##                                           
##                Accuracy : 0.8182          
##                  95% CI : (0.7837, 0.8492)
##     No Information Rate : 0.7451          
##     P-Value [Acc > NIR] : 2.471e-05       
##                                           
##                   Kappa : 0.4983          
##                                           
##  Mcnemar's Test P-Value : 0.05993         
##                                           
##             Sensitivity : 0.5734          
##             Specificity : 0.9019          
##          Pos Pred Value : 0.6667          
##          Neg Pred Value : 0.8607          
##              Prevalence : 0.2549          
##          Detection Rate : 0.1462          
##    Detection Prevalence : 0.2193          
##       Balanced Accuracy : 0.7377          
##                                           
##        'Positive' Class : 1               
## 
ROC_model_rf_1 <- rocit(score=prediksi_prob_data_test[,2], class=data_sinta_test$y)
plot(ROC_model_rf_1)

ROC_model_rf_1$AUC
## [1] 0.8612524
vip(model_rf_1, num_features = 50)
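
Since mtry is left at its default above, a natural follow-up is to tune it; randomForest::tuneRF() steps mtry up and down from the default and keeps going while the OOB error improves. A minimal sketch, not part of the original analysis (tune_rf is a helper name):

set.seed(478)
tune_rf <- randomForest::tuneRF(
  x = subset(data_sinta_train, select = -y),  # predictors only
  y = data_sinta_train$y,
  ntreeTry   = 500,
  stepFactor = 1.5,
  improve    = 0.01,
  trace      = FALSE
)
tune_rf   # OOB error for each mtry tried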

Comparison of Model Results

hasil_eval <- rbind(
  c(eval_reglog_1$overall[1], eval_reglog_1$byClass[1], eval_reglog_1$byClass[2]),
  c(eval_reglog_2$overall[1], eval_reglog_2$byClass[1], eval_reglog_2$byClass[2]),
  c(eval_tree_1$overall[1], eval_tree_1$byClass[1], eval_tree_1$byClass[2]),
  c(eval_tree_2$overall[1], eval_tree_2$byClass[1], eval_tree_2$byClass[2]),
  c(eval_tree_3$overall[1], eval_tree_3$byClass[1], eval_tree_3$byClass[2]),
  c(eval_tree_4$overall[1], eval_tree_4$byClass[1], eval_tree_4$byClass[2]),
  c(eval_model_bag_1$overall[1], eval_model_bag_1$byClass[1], eval_model_bag_1$byClass[2]),
  c(eval_model_rf_1$overall[1], eval_model_rf_1$byClass[1], eval_model_rf_1$byClass[2]))
row.names(hasil_eval) <- 
  c("RegLog Semua Peubah","RegLog Seleksi Peubah",
    "ClassTree 1","ClassTree 2","ClassTree 3","ClassTree 4",
    "Bagging 1", "RandomForest 1")
hasil_eval <- as.data.frame(hasil_eval)
dplyr::arrange(.data = hasil_eval, desc(Accuracy))
##                        Accuracy Sensitivity Specificity
## RegLog Semua Peubah   0.8627451   0.6643357   0.9306220
## ClassTree 4           0.8360071   0.6853147   0.8875598
## RandomForest 1        0.8181818   0.5734266   0.9019139
## ClassTree 1           0.8146168   0.6153846   0.8827751
## ClassTree 3           0.8057041   0.5804196   0.8827751
## ClassTree 2           0.7950089   0.6223776   0.8540670
## Bagging 1             0.7932264   0.5384615   0.8803828
## RegLog Seleksi Peubah 0.7450980   0.0000000   1.0000000
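
The same comparison can also be rendered as a formatted table with kableExtra, which is already loaded; a minimal sketch:

hasil_eval %>%
  dplyr::arrange(desc(Accuracy)) %>%
  kbl(digits = 3, caption = "Test-set performance of all models") %>%
  kable_styling(full_width = FALSE)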