R04STA1381: Analisis Data Bagian 1
package yang dibutuhkan:
Library
library(tidyverse)
library(kableExtra)
library(ggplot2)
library(ggthemes)
library(stringr)
library(reshape2)
library(mice)
library(nortest)
library(DescTools)
library(caret)
library(rpart)
library(rpart.plot)
library(ROCit)
library(PRROC)
library(ROCR)
library(vip)
Data
Data Formatting
# Convert the four categorical columns to factors in one pass.
kolom_faktor <- c("Level", "Prodi", "Status", "Akreditasi")
df_authors_ok[kolom_faktor] <- lapply(df_authors_ok[kolom_faktor], as.factor)
# data structure after formatting
glimpse(df_authors_ok[,c("Level", "Prodi", "Status", "Akreditasi")])## Rows: 3,556
## Columns: 4
## $ Level <fct> S1, S1, S2, S1, S2, S2, S2, S2, S2, S2, S3, S3, S2, S2, S1,~
## $ Prodi <fct> "Sistem Informasi", "Teknik Kimia", "Matematika", "Teknolog~
## $ Status <fct> Aktif, Aktif, Aktif, Aktif, Aktif, Aktif, Aktif, Aktif, Akt~
## $ Akreditasi <fct> Unggul, Unggul, B, Unggul, A, Unggul, Baik, A, A, B, Unggul~
Re-Level Factor
levels(df_authors_ok$Level) #level awal## [1] "D3" "D4" "Profesi" "S1" "S2" "S3" "Sp-1"
## [8] "Sp-2" "Unknown"
# Re-level: keep only these six programme levels, in this order. Any other
# level present in the data (here Sp-1, Sp-2 and Unknown) becomes NA.
# The levels are named explicitly instead of being picked by position, so the
# result does not depend on the alphabetical order of the levels in the data.
df_authors_ok$Level <- factor(
  df_authors_ok$Level,
  levels = c("D3", "D4", "S2", "S3", "Profesi", "S1")
)
levels(df_authors_ok$Level) #setelah re-level## [1] "D3" "D4" "S2" "S3" "Profesi" "S1"
levels(df_authors_ok$Akreditasi)## [1] "-" "A" "B"
## [4] "Baik" "Baik Sekali" "Tidak Terakreditasi"
## [7] "Unggul"
Rumpun Ilmu dari Prodi
# Build one row per (Kode_Prodi, Prodi) pair and extract the first two
# characters of the programme code, used below to assign the field of study.
# summarize() with no arguments keeps only the grouping columns.
df_rumpun <- df_authors_ok %>%
  group_by(Kode_Prodi, Prodi) %>%
  summarize() %>%
  mutate(Kode_Prodi_2Digit = substring(Kode_Prodi, 1, 2)) %>%
  na.omit()
## `summarise()` has grouped output by 'Kode_Prodi'. You can override using the `.groups` argument.
df_rumpun## # A tibble: 256 x 3
## # Groups: Kode_Prodi [256]
## Kode_Prodi Prodi Kode_Prodi_2Digit
## <dbl> <fct> <chr>
## 1 11001 Ilmu Kedokteran 11
## 2 11002 Ilmu Biomedik 11
## 3 11101 Ilmu Biomedik 11
## 4 11108 Kedokteran Kerja 11
## 5 11109 Pendidikan Kedokteran 11
## 6 11123 Teknologi Biomedis 11
## 7 11201 Pendidikan Dokter 11
## 8 11301 Fisioterapi 11
## 9 11303 Terapi Okupasi 11
## 10 11401 Fisioterapi 11
## # ... with 246 more rows
# Assign the field of study (rumpun ilmu) from the two-character code prefix.
# The prefix is a character column, so it is compared against character codes.
# Prefixes that match none of the rules below are left as NA.
df_rumpun <- df_rumpun %>%
  mutate(
    Rumpun_Ilmu = case_when(
      Kode_Prodi_2Digit == "14" ~ "Kesehatan",
      Kode_Prodi_2Digit %in% as.character(c(20, 21, 22, 31, 55, 57)) ~ "Teknik",
      Kode_Prodi_2Digit %in% as.character(44:51) ~ "MIPA",
      Kode_Prodi_2Digit %in% as.character(c(60:63, 93)) ~ "Ekonomi",
      Kode_Prodi_2Digit %in% as.character(71:79) ~ "Bahasa",
      Kode_Prodi_2Digit %in% as.character(c(80:89, 94, 95)) ~ "Pendidikan",
      Kode_Prodi_2Digit %in% as.character(90) ~ "Seni,Desain,Media"
    )
  )
df_rumpun## # A tibble: 256 x 4
## # Groups: Kode_Prodi [256]
## Kode_Prodi Prodi Kode_Prodi_2Digit Rumpun_Ilmu
## <dbl> <fct> <chr> <chr>
## 1 11001 Ilmu Kedokteran 11 <NA>
## 2 11002 Ilmu Biomedik 11 <NA>
## 3 11101 Ilmu Biomedik 11 <NA>
## 4 11108 Kedokteran Kerja 11 <NA>
## 5 11109 Pendidikan Kedokteran 11 <NA>
## 6 11123 Teknologi Biomedis 11 <NA>
## 7 11201 Pendidikan Dokter 11 <NA>
## 8 11301 Fisioterapi 11 <NA>
## 9 11303 Terapi Okupasi 11 <NA>
## 10 11401 Fisioterapi 11 <NA>
## # ... with 246 more rows
# Lookup table (programme code -> field of study) to be merged into the main data
df_rumpun_oke <- df_rumpun %>%
  select(Kode_Prodi, Rumpun_Ilmu)
Analisis
#struktur data
glimpse(df_authors_ok)## Rows: 3,556
## Columns: 40
## $ SINTA_ID <dbl> 259819, 6027567, 6059011, 5982895, 60106~
## $ Nama <chr> "ACHMAD NIZAR HIDAYANTO", "ENY KUSRINI",~
## $ Universitas <chr> "Universitas Indonesia", "Universitas In~
## $ Kode_Prodi <dbl> 57201, 24201, 44101, 25202, 22101, 95129~
## $ Departemen <chr> "S1 - Sistem Informasi", "S1 - Teknik Ki~
## $ Level <fct> S1, S1, S2, S1, S2, S2, S2, S2, S2, S2, ~
## $ Prodi <fct> "Sistem Informasi", "Teknik Kimia", "Mat~
## $ SINTA_Score_Overall <dbl> 8816, 7785, 3579, 4879, 6618, 3367, 3367~
## $ SINTA_Score_3Yr <dbl> 3547, 2755, 2570, 2317, 2239, 2176, 2176~
## $ Affil_Score <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ Affil_Score_3Yr <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ Scopus_Artikel <dbl> 370, 154, 156, 177, 162, 124, 124, 235, ~
## $ Scopus_Citation <dbl> 1521, 1036, 809, 707, 1199, 147, 147, 64~
## $ Scopus_Cited_Document <dbl> 247, 127, 116, 122, 112, 53, 53, 160, 13~
## $ Scopus_H_Index <dbl> 17, 15, 16, 14, 15, 6, 6, 11, 17, 14, 8,~
## $ Scopus_i10_Index <dbl> 41, 34, 28, 23, 34, 3, 3, 14, 34, 22, 5,~
## $ Scopus_G_Index <dbl> 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 2, 1~
## $ GScholar_Artikel <dbl> 468, 233, 205, 273, 243, 182, 182, 400, ~
## $ GScholar_Citation <dbl> 3328, 1399, 1143, 1117, 2560, 382, 382, ~
## $ GScholar_Cited_Document <dbl> 329, 147, 135, 149, 159, 93, 93, 270, 13~
## $ GScholar_H_Index <dbl> 26, 17, 18, 18, 26, 10, 10, 22, 20, 16, ~
## $ GScholar_i10_Index <dbl> 105, 43, 42, 40, 60, 12, 12, 72, 40, 26,~
## $ GScholar_G_Index <dbl> 1, 1, 2, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1~
## $ WOS_Artikel <dbl> 129, 99, 0, 0, 68, 43, 43, 0, 94, 62, 0,~
## $ WOS_Citation <dbl> 423, 785, 0, 0, 661, 45, 45, 0, 771, 332~
## $ WOS_Cited_Document <dbl> 91, 91, 0, 0, 55, 20, 20, 0, 73, 47, 0, ~
## $ WOS_H_Index <dbl> 10, 14, NA, NA, 12, 3, 3, NA, 14, 9, NA,~
## $ WOS_i10_Index <dbl> 12, 27, NA, NA, 14, 0, 0, NA, 18, 9, NA,~
## $ WOS_G_Index <dbl> 3, 14, NA, NA, 11, 1, 1, NA, 1, 7, NA, N~
## $ Status <fct> Aktif, Aktif, Aktif, Aktif, Aktif, Aktif~
## $ Akreditasi <fct> Unggul, Unggul, B, Unggul, A, Unggul, Ba~
## $ Jumlah_Dosen_Penghitung_Rasio <dbl> 29, 38, 14, 23, 34, 29, 29, 23, 35, 14, ~
## $ Jumlah_Dosen_NIDN <dbl> 16, 14, 7, 7, 4, 5, 5, 12, 6, 7, 7, 7, 6~
## $ Jumlah_Dosen_NIDK <dbl> 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0~
## $ Jumlah_Dosen_Total <dbl> 16, 15, 7, 7, 5, 6, 6, 12, 6, 7, 7, 7, 6~
## $ Jumlah_Mahasiswa <dbl> 815, 591, 55, 225, 476, 206, 206, 403, 9~
## $ Rasio_Dosen_per_Mahasiswa <chr> "1 : 28.10", "1 : 15.55", "1 : 3.93", "1~
## $ Rasio_Dosen_Per_Mahasiswa <dbl> 3.558282, 6.429780, 25.454545, 10.222222~
## $ jumlah_artikel <dbl> 967, 486, 361, 450, 473, 349, 349, 635, ~
## $ keterangan <chr> "Diatas Median", "Diatas Median", "Diata~
Data
Unit Observasi = Authors
y = SINTA_Score_3Yr yang dikategorisasi menjadi tinggi
dan rendah
x1 = Rumpun Ilmu (Ganjil 2021)
x2 = Level (Ganjil 2021)
x3 = Akreditasi (Ganjil 2021)
x4 = Total Jumlah Dosen (Ganjil 2021)
x5 = Jumlah Mahasiswa (Ganjil 2021)
x6 = Rasio Dosen per Mahasiswa (Ganjil 2021)
# Assemble the modelling data: response y plus the candidate predictors.
# The ratio is computed from the columns flowing through the pipeline (before
# select() drops Jumlah_Dosen_Penghitung_Rasio) rather than from
# df_authors_ok$..., so it stays aligned with the joined rows whatever the
# join does to row count or order.
data_1 <- df_authors_ok %>%
  left_join(df_rumpun_oke, by = "Kode_Prodi") %>%
  mutate(
    Rasio_Dosen_per_Mahasiswa = Jumlah_Dosen_Penghitung_Rasio / Jumlah_Mahasiswa,
    # class 1: high SINTA_Score_3Yr (score >= 239)
    y = factor(ifelse(SINTA_Score_3Yr >= 239, "1", "0"), levels = c("0", "1")),
    Rumpun_Ilmu = as.factor(Rumpun_Ilmu)
  ) %>%
  select(
    SINTA_Score_3Yr, Prodi, Rumpun_Ilmu, Level, Akreditasi,
    Jumlah_Dosen_Total, Jumlah_Mahasiswa, Rasio_Dosen_per_Mahasiswa, y
  )
summary(data_1)## SINTA_Score_3Yr Prodi Rumpun_Ilmu
## Min. : 0.0 Ilmu Hukum : 88 Bahasa : 422
## 1st Qu.: 1.0 Pendidikan Dokter : 70 Ekonomi : 382
## Median : 37.0 Psikologi : 67 MIPA : 188
## Mean : 141.1 Obstetri dan Ginekologi: 64 Teknik : 187
## 3rd Qu.: 161.0 Akuntansi : 57 Pendidikan: 135
## Max. :3547.0 (Other) :2255 (Other) : 54
## NA's : 955 NA's :2188
## Level Akreditasi Jumlah_Dosen_Total Jumlah_Mahasiswa
## D3 : 62 A :1538 Min. : 0.00 Min. : 0.0
## D4 : 18 Unggul : 743 1st Qu.: 6.00 1st Qu.: 66.0
## S2 : 637 - : 140 Median :10.00 Median : 213.0
## S3 : 269 B : 110 Mean :16.85 Mean : 375.3
## Profesi: 79 Baik : 55 3rd Qu.:19.00 3rd Qu.: 503.0
## S1 : 985 (Other): 35 Max. :71.00 Max. :2290.0
## NA's :1506 NA's : 935 NA's :935 NA's :935
## Rasio_Dosen_per_Mahasiswa y
## Min. :0.0123 0:2915
## 1st Qu.:0.0666 1: 641
## Median :0.1346
## Mean :0.2373
## 3rd Qu.:0.3490
## Max. :1.4000
## NA's :990
#Cek missing values
md.pattern(data_1,rotate.names = TRUE)## SINTA_Score_3Yr y Akreditasi Jumlah_Dosen_Total Jumlah_Mahasiswa Prodi
## 1325 1 1 1 1 1 1
## 710 1 1 1 1 1 1
## 502 1 1 1 1 1 1
## 15 1 1 1 1 1 1
## 40 1 1 1 1 1 1
## 28 1 1 1 1 1 0
## 1 1 1 1 1 1 0
## 9 1 1 0 0 0 1
## 926 1 1 0 0 0 0
## 0 0 935 935 935 955
## Rasio_Dosen_per_Mahasiswa Level Rumpun_Ilmu
## 1325 1 1 1 0
## 710 1 1 0 1
## 502 1 0 0 2
## 15 0 1 1 1
## 40 0 0 0 3
## 28 1 0 1 2
## 1 1 0 0 3
## 9 0 0 0 6
## 926 0 0 0 7
## 990 1506 2188 8444
# Drop rows with a missing categorical predictor, then rows whose ratio is
# infinite. filter() also discards rows where the comparison is NA, i.e. rows
# with a missing ratio.
data_1 <- data_1 %>%
  drop_na(Prodi, Level, Akreditasi, Rumpun_Ilmu) %>%
  filter(Rasio_Dosen_per_Mahasiswa != Inf)
head(data_1) #data yang akan digunakan## # A tibble: 6 x 9
## SINTA_Score_3Yr Prodi Rumpun_Ilmu Level Akreditasi Jumlah_Dosen_Tot~
## <dbl> <fct> <fct> <fct> <fct> <dbl>
## 1 3547 Sistem Informa~ Teknik S1 Unggul 16
## 2 2570 Matematika MIPA S2 B 7
## 3 2239 Teknik Sipil Teknik S2 A 5
## 4 2176 Ilmu Lingkungan Pendidikan S2 Unggul 6
## 5 2176 Ilmu Lingkungan Pendidikan S2 Baik 6
## 6 2007 Teknologi Info~ Teknik S2 A 12
## # ... with 3 more variables: Jumlah_Mahasiswa <dbl>,
## # Rasio_Dosen_per_Mahasiswa <dbl>, y <fct>
summary(data_1)## SINTA_Score_3Yr Prodi Rumpun_Ilmu Level
## Min. : 0.0 Ilmu Hukum : 88 Bahasa :422 D3 : 34
## 1st Qu.: 3.0 Psikologi : 67 Ekonomi :366 D4 : 12
## Median : 41.0 Akuntansi : 57 Kesehatan : 44 S2 :413
## Mean : 165.8 Ilmu Ekonomi : 56 MIPA :188 S3 :168
## 3rd Qu.: 175.0 Manajemen : 43 Pendidikan :135 Profesi: 24
## Max. :3547.0 Ilmu Komputer: 39 Seni,Desain,Media: 9 S1 :674
## (Other) :975 Teknik :161
## Akreditasi Jumlah_Dosen_Total Jumlah_Mahasiswa
## - : 48 Min. : 2.00 Min. : 8.0
## A :717 1st Qu.: 6.00 1st Qu.: 115.0
## B : 34 Median :11.00 Median : 311.0
## Baik : 21 Mean :18.31 Mean : 523.8
## Baik Sekali : 27 3rd Qu.:20.00 3rd Qu.: 643.0
## Tidak Terakreditasi: 8 Max. :71.00 Max. :2290.0
## Unggul :470
## Rasio_Dosen_per_Mahasiswa y
## Min. :0.01235 0:1062
## 1st Qu.:0.05643 1: 263
## Median :0.07849
## Mean :0.14571
## 3rd Qu.:0.17647
## Max. :0.80000
##
EDA Data
Peubah Respon (y)
# Data for the chart: number of authors per response class and its percentage
data_chart <- data_1 %>%
  count(y, name = "value") %>%
  mutate(prop = round(value / sum(value) * 100, digits = 2))
# Pie chart: distribution of authors by SINTA_Score_3Yr category
ggplot(data_chart, aes(x = "", y = prop, fill = y)) +
  geom_col(width = 1, color = "white") +
  coord_polar(theta = "y", start = 0) +
  labs(
    title = "Proporsi Authors Menurut Kategori SINTA_Score_3Yr",
    subtitle = "Universitas Indonesia"
  ) +
  theme_void()
Peubah Prediktor (X) Numerik
# Density histogram of one numeric predictor.
#
# data   : data frame holding the variable
# peubah : column name as a string; also used for the x label and the title
# bins   : number of histogram bins
#
# geom_histogram() plots counts by default; y is mapped to after_stat(density)
# so that the bars show what the "Density" axis label says.
plot_sebaran <- function(data, peubah, bins) {
  ggplot(data, aes(x = .data[[peubah]], y = after_stat(density))) +
    geom_histogram(fill = "#69b3a2", color = "#e9ecef", alpha = 0.8, bins = bins) +
    theme_light() +
    labs(
      x = peubah,
      y = "Density",
      title = paste("Sebaran", peubah),
      subtitle = "Universitas Indonesia"
    )
}
#Density Jumlah_Dosen_Total
plot_sebaran(data_1, "Jumlah_Dosen_Total", bins = 15)
#Density Jumlah_Mahasiswa
plot_sebaran(data_1, "Jumlah_Mahasiswa", bins = 15)
#Density Rasio_Dosen_per_Mahasiswa
plot_sebaran(data_1, "Rasio_Dosen_per_Mahasiswa", bins = 20)
### Peubah Prediktor (X) Kategorik
# Bar charts of the number of authors per category of each categorical
# predictor. Titles and axis labels are left empty as in the original report.
# Akreditasi
data_bar_chart <- data_1 %>%
  count(Akreditasi, name = "Jumlah")
ggplot(data_bar_chart, aes(x = Akreditasi, y = Jumlah)) +
  geom_col(color = "steelblue") +
  theme_light() +
  labs(x = "", y = "", title = "", subtitle = "Universitas Indonesia") +
  coord_flip()
# Level
data_bar_chart <- data_1 %>%
  count(Level, name = "Jumlah")
ggplot(data_bar_chart, aes(x = Level, y = Jumlah)) +
  geom_col(color = "steelblue") +
  theme_light() +
  labs(x = "", y = "", title = "", subtitle = "Universitas Indonesia") +
  coord_flip()
# Rumpun Ilmu
data_bar_chart <- data_1 %>%
  count(Rumpun_Ilmu, name = "Jumlah")
ggplot(data_bar_chart, aes(x = Rumpun_Ilmu, y = Jumlah)) +
  geom_col(color = "steelblue") +
  theme_light() +
  labs(x = "", y = "", title = "", subtitle = "Universitas Indonesia") +
  coord_flip()
Hubungan Peubah Prediktor dengan Peubah Respon
# Stacked 100% bars of the response class within each category, labelled with
# the within-category percentage taken from percentData.
# Akreditasi & y
percentData <- data_1 %>%
  count(Akreditasi, y) %>%
  group_by(Akreditasi) %>%
  mutate(ratio = scales::percent(n / sum(n)))
ggplot(data_1, aes(x = factor(Akreditasi), fill = y)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("#7be217", "#4f58ab")) +
  geom_text(
    data = percentData,
    aes(y = n, label = ratio),
    color = "white",
    position = position_fill(vjust = 0.5)
  ) +
  labs(
    y = "",
    x = "Akreditasi",
    subtitle = "UI",
    title = "Proporsi Peubah Respon Menurut Akreditasi"
  )
# Level & y
percentData <- data_1 %>%
  count(Level, y) %>%
  group_by(Level) %>%
  mutate(ratio = scales::percent(n / sum(n)))
ggplot(data_1, aes(x = factor(Level), fill = y)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("#7be217", "#4f58ab")) +
  geom_text(
    data = percentData,
    aes(y = n, label = ratio),
    color = "white",
    position = position_fill(vjust = 0.5)
  ) +
  labs(
    y = "",
    x = "Level",
    subtitle = "UI",
    title = "Proporsi Peubah Respon Menurut Level"
  )
# Rumpun_Ilmu & y
percentData <- data_1 %>%
  count(Rumpun_Ilmu, y) %>%
  group_by(Rumpun_Ilmu) %>%
  mutate(ratio = scales::percent(n / sum(n)))
ggplot(data_1, aes(x = factor(Rumpun_Ilmu), fill = y)) +
  geom_bar(position = "fill") +
  scale_fill_manual(values = c("#7be217", "#4f58ab")) +
  geom_text(
    data = percentData,
    aes(y = n, label = ratio),
    color = "white",
    position = position_fill(vjust = 0.5)
  ) +
  labs(
    y = "",
    x = "Rumpun_Ilmu",
    subtitle = "UI",
    title = "Proporsi Peubah Respon Menurut Rumpun Ilmu"
  )
# Jumlah_Mahasiswa & y
# Horizontal boxplots of each numeric predictor by response class.
# The original also mapped fill and alpha to the numeric variable, but both
# are set to constants in geom_boxplot(), so those mappings were dropped.
#Boxplot by category
ggplot(data_1, aes(x = Jumlah_Mahasiswa, y = y)) +
  geom_boxplot(fill = "#69b3a2", alpha = 0.8) +
  theme_light() +
  labs(
    x = "Jumlah_Mahasiswa",
    y = "y",
    title = "Sebaran Jumlah Mahasiswa Menurut Peubah Respon",
    subtitle = "Universitas Indonesia"
  )
# Jumlah_Dosen_Total & y
#Boxplot by category
ggplot(data_1, aes(x = Jumlah_Dosen_Total, y = y)) +
  geom_boxplot(fill = "#69b3a2", alpha = 0.8) +
  theme_light() +
  labs(
    x = "Jumlah_Dosen_Total",
    y = "y",
    title = "Sebaran Jumlah Dosen Total Menurut Peubah Respon",
    subtitle = "Universitas Indonesia"
  )
# Rasio_Dosen_per_Mahasiswa & y
#Boxplot by category
ggplot(data_1, aes(x = Rasio_Dosen_per_Mahasiswa, y = y)) +
  geom_boxplot(fill = "#69b3a2", alpha = 0.8) +
  theme_light() +
  labs(
    x = "Rasio_Dosen_per_Mahasiswa",
    y = "y",
    title = "Sebaran Rasio Dosen per Mahasiswa Menurut Peubah Respon",
    subtitle = "Universitas Indonesia"
  )
Data Model
# Data used for modelling: drop the raw score (y is derived from it) and Prodi
data_sinta <- select(data_1, -SINTA_Score_3Yr, -Prodi)
str(data_sinta)## tibble [2,035 x 7] (S3: tbl_df/tbl/data.frame)
## $ Rumpun_Ilmu : Factor w/ 7 levels "Bahasa","Ekonomi",..: 7 NA 4 NA 7 5 5 7 7 4 ...
## $ Level : Factor w/ 6 levels "D3","D4","S2",..: 6 6 3 6 3 3 3 3 3 3 ...
## $ Akreditasi : Factor w/ 7 levels "-","A","B","Baik",..: 7 7 3 7 2 7 4 2 2 3 ...
## $ Jumlah_Dosen_Total : num [1:2035] 16 15 7 7 5 6 6 12 6 7 ...
## $ Jumlah_Mahasiswa : num [1:2035] 815 591 55 225 476 206 206 403 91 55 ...
## $ Rasio_Dosen_per_Mahasiswa: num [1:2035] 0.0356 0.0643 0.2545 0.1022 0.0714 ...
## $ y : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
Splitting Data
set.seed(478)
# 70/30 partition on the response. `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned; y is already a factor, so no
# conversion is needed here.
in.train <- createDataPartition(data_sinta$y, p = 0.7, list = FALSE)
data_sinta_train <- data_sinta[in.train, ] # training data for modelling
data_sinta_test <- data_sinta[-in.train, ] # testing data for model evaluation
#proporsi kelas peubah respon pada data
round(prop.table(table(data_sinta_train$y)), digits = 4)##
## 0 1
## 0.7707 0.2293
round(prop.table(table(data_sinta_test$y)), digits = 4)##
## 0 1
## 0.7718 0.2282
Regresi Logistik
Semua Peubah
# Logistic regression of y on every other column of the training data.
# NOTE(review): the fit warns that fitted probabilities of 0 or 1 occurred, and
# the summary below shows standard errors of ~834 for the Level terms after 16
# Fisher scoring iterations -- this looks like (quasi-)complete separation;
# confirm before interpreting those coefficients.
model_reglog_1 <- glm(y~., data_sinta_train, family=binomial())## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
summary(model_reglog_1)##
## Call:
## glm(formula = y ~ ., family = binomial(), data = data_sinta_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.05878 -0.43306 -0.15604 -0.06321 2.97138
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.902e+01 8.340e+02 -0.023 0.981806
## Rumpun_IlmuEkonomi 2.162e+00 7.745e-01 2.792 0.005237 **
## Rumpun_IlmuKesehatan 4.345e+00 8.479e-01 5.124 2.98e-07 ***
## Rumpun_IlmuMIPA 5.293e+00 7.724e-01 6.852 7.30e-12 ***
## Rumpun_IlmuPendidikan 3.004e+00 8.030e-01 3.741 0.000183 ***
## Rumpun_IlmuSeni,Desain,Media -1.149e+01 1.601e+03 -0.007 0.994270
## Rumpun_IlmuTeknik 5.390e+00 7.855e-01 6.862 6.79e-12 ***
## LevelD4 4.401e-01 1.594e+03 0.000 0.999780
## LevelS2 1.546e+01 8.340e+02 0.019 0.985205
## LevelS3 1.585e+01 8.340e+02 0.019 0.984835
## LevelProfesi 1.240e+01 8.340e+02 0.015 0.988133
## LevelS1 1.465e+01 8.340e+02 0.018 0.985985
## AkreditasiA -4.600e-01 6.152e-01 -0.748 0.454685
## AkreditasiB 6.867e-01 8.626e-01 0.796 0.426004
## AkreditasiBaik -1.372e-01 8.411e-01 -0.163 0.870405
## AkreditasiBaik Sekali -9.258e-01 9.386e-01 -0.986 0.323953
## AkreditasiTidak Terakreditasi -1.713e+00 1.112e+00 -1.540 0.123615
## AkreditasiUnggul -7.209e-01 5.653e-01 -1.275 0.202204
## Jumlah_Dosen_Total 7.843e-03 3.401e-02 0.231 0.817616
## Jumlah_Mahasiswa -8.011e-04 1.104e-03 -0.726 0.468101
## Rasio_Dosen_per_Mahasiswa -2.117e+00 1.165e+00 -1.817 0.069213 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 898.54 on 920 degrees of freedom
## Residual deviance: 549.08 on 900 degrees of freedom
## (505 observations deleted due to missingness)
## AIC: 591.08
##
## Number of Fisher Scoring iterations: 16
# Prediction on the training data
prediksi_prob_data_train <- predict(model_reglog_1, data_sinta_train, type = "response")
# Classify at the 0.5 cut-off. The levels are fixed to c("0", "1") so the
# factor always matches the reference, even if only one class is predicted.
prediksi_data_train <- factor(
  ifelse(prediksi_prob_data_train > 0.5, "1", "0"),
  levels = c("0", "1")
)
eval_reglog_1_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive = "1")
eval_reglog_1_train## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 702 91
## 1 43 85
##
## Accuracy : 0.8545
## 95% CI : (0.8301, 0.8767)
## No Information Rate : 0.8089
## P-Value [Acc > NIR] : 0.0001675
##
## Kappa : 0.4747
##
## Mcnemar's Test P-Value : 4.903e-05
##
## Sensitivity : 0.48295
## Specificity : 0.94228
## Pos Pred Value : 0.66406
## Neg Pred Value : 0.88525
## Prevalence : 0.19110
## Detection Rate : 0.09229
## Detection Prevalence : 0.13898
## Balanced Accuracy : 0.71262
##
## 'Positive' Class : 1
##
Sensitivity: kemampuan model dalam memprediksi kelas positif
Specificity: kemampuan model dalam memprediksi kelas negatif
# Prediction on the testing data
prediksi_prob_data_test <- predict(model_reglog_1, data_sinta_test, type = "response")
# Classify at the 0.5 cut-off. The levels are fixed to c("0", "1") so the
# factor always matches the reference, even if only one class is predicted.
prediksi_data_test <- factor(
  ifelse(prediksi_prob_data_test > 0.5, "1", "0"),
  levels = c("0", "1")
)
eval_reglog_1 <- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive = "1")
eval_reglog_1## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 300 43
## 1 17 44
##
## Accuracy : 0.8515
## 95% CI : (0.813, 0.8847)
## No Information Rate : 0.7847
## P-Value [Acc > NIR] : 0.000428
##
## Kappa : 0.5071
##
## Mcnemar's Test P-Value : 0.001249
##
## Sensitivity : 0.5057
## Specificity : 0.9464
## Pos Pred Value : 0.7213
## Neg Pred Value : 0.8746
## Prevalence : 0.2153
## Detection Rate : 0.1089
## Detection Prevalence : 0.1510
## Balanced Accuracy : 0.7261
##
## 'Positive' Class : 1
##
Performa model pada data training dan data testing perlu dibandingkan untuk mengetahui adanya overfitting/underfitting
Overfitting terjadi ketika performa model pada data training jauh lebih tinggi dibandingkan dengan performa model pada data testing (model mempelajari data training terlalu baik sehingga tidak dapat digeneralisasi)
Underfitting terjadi ketika performa model rendah baik pada data training maupun pada data testing (model tidak mempelajari pola data dengan baik)
# Plot a ROC curve (TPR against FPR) with the AUC shown in the title.
#
# pred  : numeric scores, higher meaning more likely the positive class
# truth : true class labels, same length as pred
# ...   : further arguments forwarded to plot() (do not pass `main`, it is set here)
#
# Returns the AUC invisibly.
rocplot <- function(pred, truth, ...) {
  predob <- ROCR::prediction(pred, truth)
  perf <- ROCR::performance(predob, "tpr", "fpr")
  # y.values is a list; take its first element to get a plain number
  auc <- ROCR::performance(predob, "auc")@y.values[[1]]
  plot(perf, main = paste("AUC =", round(auc, 4)), ...)
  invisible(auc)
}
NA.OMIT
#df_authors_UI <- na.omit(prediksi_prob_data_train)
#ROC data training
rocplot(prediksi_prob_data_train,data_sinta_train$y)
data_sinta_test## # A tibble: 609 x 7
## Rumpun_Ilmu Level Akreditasi Jumlah_Dosen_Total Jumlah_Mahasiswa
## <fct> <fct> <fct> <dbl> <dbl>
## 1 Teknik S1 Unggul 16 815
## 2 MIPA S2 B 7 55
## 3 <NA> S1 Unggul 7 225
## 4 Teknik S3 Unggul 7 43
## 5 MIPA S3 Unggul 7 34
## 6 Teknik S2 A 12 403
## 7 MIPA S2 A 6 127
## 8 Teknik S3 Unggul 6 87
## 9 Teknik S3 Unggul 6 87
## 10 Teknik S3 Unggul 6 51
## # ... with 599 more rows, and 2 more variables:
## # Rasio_Dosen_per_Mahasiswa <dbl>, y <fct>
# ROC curve on the testing data
rocplot(prediksi_prob_data_test, data_sinta_test$y)
# Variable importance (num_features caps the number of features shown at 50)
vip(model_reglog_1, num_features = 50)
Seleksi Peubah
# Reduced logistic regression (Level + Jumlah_Mahasiswa).
# Fitted on the training split only: fitting on all of data_sinta would let the
# model see the test rows it is evaluated on further down.
model_reglog_2 <- glm(y ~ Level + Jumlah_Mahasiswa, data = data_sinta_train, family = binomial())
summary(model_reglog_2)##
## Call:
## glm(formula = y ~ Level + Jumlah_Mahasiswa, family = binomial(),
## data = data_sinta)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9858 -0.7647 -0.6192 -0.1815 2.8854
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.8822017 1.0087099 -3.849 0.000119 ***
## LevelD4 1.1244597 1.4406433 0.781 0.435081
## LevelS2 3.3173996 1.0117302 3.279 0.001042 **
## LevelS3 3.4221448 1.0161952 3.368 0.000758 ***
## LevelProfesi 2.3077538 1.0636234 2.170 0.030029 *
## LevelS1 2.9852008 1.0140145 2.944 0.003241 **
## Jumlah_Mahasiswa -0.0011125 0.0001943 -5.726 1.03e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2189.9 on 2034 degrees of freedom
## Residual deviance: 2044.6 on 2028 degrees of freedom
## AIC: 2058.6
##
## Number of Fisher Scoring iterations: 6
# Prediction on the training data
prediksi_prob_data_train <- predict(model_reglog_2, data_sinta_train, type = "response")
# The levels are fixed to c("0", "1"): when no probability exceeds 0.5,
# as.factor() yields a one-level factor and confusionMatrix() has to refactor
# it, with a warning.
prediksi_data_train <- factor(
  ifelse(prediksi_prob_data_train > 0.5, "1", "0"),
  levels = c("0", "1")
)
eval_reglog_2_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")## Warning in confusionMatrix.default(prediksi_data_train, data_sinta_train$y, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
eval_reglog_2_train## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1099 327
## 1 0 0
##
## Accuracy : 0.7707
## 95% CI : (0.748, 0.7923)
## No Information Rate : 0.7707
## P-Value [Acc > NIR] : 0.5148
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.7707
## Prevalence : 0.2293
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 1
##
rocplot(prediksi_prob_data_train,data_sinta_train$y) # Prediksi pada Data Testing
prediksi_prob_data_test <- predict(model_reglog_2, data_sinta_test, type = "response")
# The levels are fixed to c("0", "1"): when no probability exceeds 0.5,
# as.factor() yields a one-level factor and confusionMatrix() has to refactor
# it, with a warning.
prediksi_data_test <- factor(
  ifelse(prediksi_prob_data_test > 0.5, "1", "0"),
  levels = c("0", "1")
)
eval_reglog_2 <- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")## Warning in confusionMatrix.default(prediksi_data_test, data_sinta_test$y, :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
eval_reglog_2## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 470 139
## 1 0 0
##
## Accuracy : 0.7718
## 95% CI : (0.7363, 0.8045)
## No Information Rate : 0.7718
## P-Value [Acc > NIR] : 0.5227
##
## Kappa : 0
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0000
## Specificity : 1.0000
## Pos Pred Value : NaN
## Neg Pred Value : 0.7718
## Prevalence : 0.2282
## Detection Rate : 0.0000
## Detection Prevalence : 0.0000
## Balanced Accuracy : 0.5000
##
## 'Positive' Class : 1
##
# ROC curve on the testing data, then variable importance of the reduced model
rocplot(prediksi_prob_data_test, data_sinta_test$y)
vip(model_reglog_2, num_features = 50)
Classification Tree
Model 1 Default
Model dengan hyperparameter minsplit dan cp default
# Classification tree of y on every other column of the training data.
# NOTE(review): the section text calls these the default hyperparameters, but
# rpart.control() defaults are minsplit = 20 and cp = 0.01; cp = 0 is set
# explicitly here -- confirm which one is intended.
model_tree_1 <- rpart(y ~., data = data_sinta_train, method = "class",
control=rpart.control(minsplit = 20, cp=0))
rpart.plot(model_tree_1, extra = 4)# Prediksi pada Data Training
# Class probabilities and predicted classes of model_tree_1 on the training
# data, then the confusion matrix with "1" as the positive class.
prediksi_prob_data_train <- predict(model_tree_1, newdata = data_sinta_train, type = "prob")
prediksi_data_train <- predict(model_tree_1, newdata = data_sinta_train, type = "class")
eval_tree_1_train <- caret::confusionMatrix(
  data = prediksi_data_train,
  reference = data_sinta_train$y,
  positive = "1"
)
eval_tree_1_train## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1017 111
## 1 82 216
##
## Accuracy : 0.8647
## 95% CI : (0.8458, 0.882)
## No Information Rate : 0.7707
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.6048
##
## Mcnemar's Test P-Value : 0.04385
##
## Sensitivity : 0.6606
## Specificity : 0.9254
## Pos Pred Value : 0.7248
## Neg Pred Value : 0.9016
## Prevalence : 0.2293
## Detection Rate : 0.1515
## Detection Prevalence : 0.2090
## Balanced Accuracy : 0.7930
##
## 'Positive' Class : 1
##
rocplot(prediksi_prob_data_train[,2],data_sinta_train$y) # Prediksi pada Data Testing
# Class probabilities and predicted classes of model_tree_1 on the testing
# data, then the confusion matrix with "1" as the positive class.
prediksi_prob_data_test <- predict(model_tree_1, newdata = data_sinta_test, type = "prob")
prediksi_data_test <- predict(model_tree_1, newdata = data_sinta_test, type = "class")
eval_tree_1 <- caret::confusionMatrix(
  data = prediksi_data_test,
  reference = data_sinta_test$y,
  positive = "1"
)
eval_tree_1## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 424 61
## 1 46 78
##
## Accuracy : 0.8243
## 95% CI : (0.7917, 0.8537)
## No Information Rate : 0.7718
## P-Value [Acc > NIR] : 0.0008942
##
## Kappa : 0.4816
##
## Mcnemar's Test P-Value : 0.1759180
##
## Sensitivity : 0.5612
## Specificity : 0.9021
## Pos Pred Value : 0.6290
## Neg Pred Value : 0.8742
## Prevalence : 0.2282
## Detection Rate : 0.1281
## Detection Prevalence : 0.2036
## Balanced Accuracy : 0.7316
##
## 'Positive' Class : 1
##
# ROC curve on the testing data (column 2 = probability of class "1"), then
# variable importance of model_tree_1
rocplot(prediksi_prob_data_test[, 2], data_sinta_test$y)
vip(model_tree_1, num_features = 50)
Model 2
Model dengan hyperparameter minsplit dan cp yang ditentukan sendiri (minsplit=10 dan cp=0)
# Classification tree with hand-picked hyperparameters (minsplit = 10, cp = 0)
model_tree_2 <- rpart(
  formula = y ~ .,
  data = data_sinta_train,
  method = "class",
  control = rpart.control(minsplit = 10, cp = 0)
)
rpart.plot(model_tree_2)# Prediksi pada Data Training
# Class probabilities and predicted classes of model_tree_2 on the training
# data, then the confusion matrix with "1" as the positive class.
prediksi_prob_data_train <- predict(model_tree_2, newdata = data_sinta_train, type = "prob")
prediksi_data_train <- predict(model_tree_2, newdata = data_sinta_train, type = "class")
eval_tree_2_train <- caret::confusionMatrix(
  data = prediksi_data_train,
  reference = data_sinta_train$y,
  positive = "1"
)
eval_tree_2_train## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1027 119
## 1 72 208
##
## Accuracy : 0.8661
## 95% CI : (0.8473, 0.8833)
## No Information Rate : 0.7707
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6009
##
## Mcnemar's Test P-Value : 0.0008733
##
## Sensitivity : 0.6361
## Specificity : 0.9345
## Pos Pred Value : 0.7429
## Neg Pred Value : 0.8962
## Prevalence : 0.2293
## Detection Rate : 0.1459
## Detection Prevalence : 0.1964
## Balanced Accuracy : 0.7853
##
## 'Positive' Class : 1
##
# ROC curve and AUC of model_tree_2 on the training data
# (column 2 of the probability matrix = class "1")
ROC_model_tree_2_train <- rocit(score = prediksi_prob_data_train[, 2], class = data_sinta_train$y)
plot(ROC_model_tree_2_train)
ROC_model_tree_2_train$AUC
## [1] 0.8940725
# Prediksi pada Data Testing
# Class probabilities and predicted classes of model_tree_2 on the testing
# data, then the confusion matrix with "1" as the positive class.
prediksi_prob_data_test <- predict(model_tree_2, newdata = data_sinta_test, type = "prob")
prediksi_data_test <- predict(model_tree_2, newdata = data_sinta_test, type = "class")
eval_tree_2 <- caret::confusionMatrix(
  data = prediksi_data_test,
  reference = data_sinta_test$y,
  positive = "1"
)
eval_tree_2## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 440 66
## 1 30 73
##
## Accuracy : 0.8424
## 95% CI : (0.811, 0.8704)
## No Information Rate : 0.7718
## P-Value [Acc > NIR] : 9.855e-06
##
## Kappa : 0.5076
##
## Mcnemar's Test P-Value : 0.000354
##
## Sensitivity : 0.5252
## Specificity : 0.9362
## Pos Pred Value : 0.7087
## Neg Pred Value : 0.8696
## Prevalence : 0.2282
## Detection Rate : 0.1199
## Detection Prevalence : 0.1691
## Balanced Accuracy : 0.7307
##
## 'Positive' Class : 1
##
# ROC curve and AUC of model_tree_2 on the testing data
# (column 2 of the probability matrix = class "1")
ROC_model_tree_2 <- rocit(score = prediksi_prob_data_test[, 2], class = data_sinta_test$y)
plot(ROC_model_tree_2)
ROC_model_tree_2$AUC
## [1] 0.8301852
# Variable importance of model_tree_2
vip(model_tree_2, num_features = 50)
Model 3 Tuning Minsplit
Model dengan hyperparameter minsplit optimum
# Search for the optimal minsplit: repeat a 70/30 hold-out split 100 times
# and, for every split, fit an unpruned classification tree (cp = 0) for each
# minsplit in 1..30, recording the accuracy on the held-out part.
set.seed(478)
n_ulangan <- 100
minsplit_grid <- 1:30
# Preallocate one row per (repetition, minsplit) pair instead of growing the
# matrix with rbind() on every iteration.
# Column 1 = minsplit, column 2 = hold-out accuracy.
akurasi.semua <- matrix(NA_real_,
                        nrow = n_ulangan * length(minsplit_grid),
                        ncol = 2)
baris <- 0
for (ulangan in seq_len(n_ulangan)) {
  acak <- createDataPartition(data_sinta$y, p = 0.7, list = FALSE)
  # NOTE(review): these assignments overwrite the global train/test split on
  # every repetition, so any code that runs after this loop works with the
  # split drawn in the last repetition. Kept unchanged to preserve downstream
  # behaviour -- confirm this is intended.
  data_sinta_train <- data_sinta[acak, ]
  data_sinta_test <- data_sinta[-acak, ]
  for (k in minsplit_grid) {
    pohon <- rpart(y ~ .,
                   data = data_sinta_train,
                   method = "class",
                   control = rpart.control(minsplit = k, cp = 0))
    prediksi.prob <- predict(pohon, data_sinta_test, type = "prob")
    # Label an observation "1" when the probability in the second column
    # exceeds 0.5, otherwise "0".
    prediksi <- ifelse(prediksi.prob[, 2] > 0.5, "1", "0")
    akurasi <- mean(prediksi == data_sinta_test$y)
    baris <- baris + 1
    akurasi.semua[baris, ] <- c(k, akurasi)
  }
}
mean.akurasi <- tapply(akurasi.semua[,2], akurasi.semua[,1], mean)
plot(names(mean.akurasi),mean.akurasi, type="b", xlab="minsplit", ylab="rata-rata akurasi data testing")model_tree_3 <- rpart(y ~., data = data_sinta_train, method = "class",
control=rpart.control(minsplit = 11, cp=0))
rpart.plot(model_tree_3, extra=4)## Warning: labs do not fit even at cex 0.15, there may be some overplotting
# Predictions on the training data: class probabilities and hard class labels
# from model_tree_3, then a confusion matrix against the observed y with "1"
# as the positive class.
prediksi_prob_data_train <- predict(model_tree_3, data_sinta_train, type = "prob")
prediksi_data_train <- predict(model_tree_3, newdata=data_sinta_train, type = "class")
eval_tree_3_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
eval_tree_3_train## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1033 120
## 1 66 207
##
## Accuracy : 0.8696
## 95% CI : (0.851, 0.8866)
## No Information Rate : 0.7707
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6083
##
## Mcnemar's Test P-Value : 0.0001018
##
## Sensitivity : 0.6330
## Specificity : 0.9399
## Pos Pred Value : 0.7582
## Neg Pred Value : 0.8959
## Prevalence : 0.2293
## Detection Rate : 0.1452
## Detection Prevalence : 0.1914
## Balanced Accuracy : 0.7865
##
## 'Positive' Class : 1
##
# Predictions on the testing data: class probabilities and hard class labels
# from model_tree_3, then a confusion matrix against the observed y with "1"
# as the positive class.
prediksi_prob_data_test <- predict(model_tree_3, data_sinta_test, type = "prob")
prediksi_data_test <- predict(model_tree_3, newdata=data_sinta_test, type = "class")
eval_tree_3 <- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
eval_tree_3## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 421 63
## 1 49 76
##
## Accuracy : 0.8161
## 95% CI : (0.783, 0.8461)
## No Information Rate : 0.7718
## P-Value [Acc > NIR] : 0.004493
##
## Kappa : 0.4588
##
## Mcnemar's Test P-Value : 0.219303
##
## Sensitivity : 0.5468
## Specificity : 0.8957
## Pos Pred Value : 0.6080
## Neg Pred Value : 0.8698
## Prevalence : 0.2282
## Detection Rate : 0.1248
## Detection Prevalence : 0.2053
## Balanced Accuracy : 0.7213
##
## 'Positive' Class : 1
##
vip(model_tree_3, num_features = 50)Model 4 Opsi CP
Model dengan hyperparameter cp optimum
# Fix the RNG seed, then grow a classification tree with cp = 0 (no
# complexity-based stopping) and minsplit = 20, so that its
# complexity-parameter table can be inspected.
set.seed(478)
model_tree_4 <- rpart(y ~ ., data=data_sinta_train,
method='class',
control=rpart.control(minsplit = 20, cp=0))
printcp(model_tree_4)##
## Classification tree:
## rpart(formula = y ~ ., data = data_sinta_train, method = "class",
## control = rpart.control(minsplit = 20, cp = 0))
##
## Variables actually used in tree construction:
## [1] Akreditasi Jumlah_Dosen_Total
## [3] Jumlah_Mahasiswa Level
## [5] Rasio_Dosen_per_Mahasiswa Rumpun_Ilmu
##
## Root node error: 327/1426 = 0.22931
##
## n= 1426
##
## CP nsplit rel error xerror xstd
## 1 0.0749235 0 1.00000 1.00000 0.048547
## 2 0.0183486 2 0.85015 0.88073 0.046362
## 3 0.0081549 7 0.75841 0.82875 0.045307
## 4 0.0061162 14 0.67890 0.78593 0.044388
## 5 0.0053517 16 0.66667 0.79205 0.044522
## 6 0.0030581 20 0.64526 0.77982 0.044253
## 7 0.0022936 21 0.64220 0.74924 0.043561
## 8 0.0010194 30 0.60245 0.75229 0.043632
## 9 0.0000000 33 0.59939 0.75229 0.043632
# Refit the tree with a fixed complexity parameter, replacing the cp = 0 fit.
# NOTE(review): cp = 0.0066225 is not one of the CP values in the printed
# cptable (it lies between 0.0061162 and 0.0081549), and the lowest xerror in
# that table is at CP 0.0022936 -- confirm how this value was chosen.
model_tree_4 <- rpart(y ~ ., data=data_sinta_train,
method='class',
control=rpart.control(minsplit = 20, cp=0.0066225))
rpart.plot(model_tree_4)# Prediksi pada Data Training
# Predictions of model_tree_4 on the training data: class probabilities
# (type = "prob") and hard class labels (type = "class"), followed by a
# confusion matrix against the observed y with "1" as the positive class.
prediksi_prob_data_train <- predict(model_tree_4, data_sinta_train, type = "prob")
prediksi_data_train <- predict(model_tree_4, newdata=data_sinta_train, type = "class")
eval_tree_4_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
eval_tree_4_train## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1037 160
## 1 62 167
##
## Accuracy : 0.8443
## 95% CI : (0.8244, 0.8628)
## No Information Rate : 0.7707
## P-Value [Acc > NIR] : 3.191e-12
##
## Kappa : 0.5077
##
## Mcnemar's Test P-Value : 7.504e-11
##
## Sensitivity : 0.5107
## Specificity : 0.9436
## Pos Pred Value : 0.7293
## Neg Pred Value : 0.8663
## Prevalence : 0.2293
## Detection Rate : 0.1171
## Detection Prevalence : 0.1606
## Balanced Accuracy : 0.7271
##
## 'Positive' Class : 1
##
ROC_model_tree_4_train <- rocit(score=prediksi_prob_data_train[,2], class=data_sinta_train$y)
plot(ROC_model_tree_4_train)ROC_model_tree_4_train$AUC## [1] 0.7733511
# Predictions on the testing data: class probabilities and hard class labels
# from model_tree_4, then a confusion matrix against the observed y with "1"
# as the positive class.
prediksi_prob_data_test <- predict(model_tree_4, data_sinta_test, type = "prob")
prediksi_data_test <- predict(model_tree_4, newdata=data_sinta_test, type = "class")
eval_tree_4 <- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
eval_tree_4## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 439 76
## 1 31 63
##
## Accuracy : 0.8243
## 95% CI : (0.7917, 0.8537)
## No Information Rate : 0.7718
## P-Value [Acc > NIR] : 0.0008942
##
## Kappa : 0.4371
##
## Mcnemar's Test P-Value : 2.103e-05
##
## Sensitivity : 0.4532
## Specificity : 0.9340
## Pos Pred Value : 0.6702
## Neg Pred Value : 0.8524
## Prevalence : 0.2282
## Detection Rate : 0.1034
## Detection Prevalence : 0.1544
## Balanced Accuracy : 0.6936
##
## 'Positive' Class : 1
##
ROC_model_tree_4 <- rocit(score=prediksi_prob_data_test[,2], class=data_sinta_test$y)
plot(ROC_model_tree_4)ROC_model_tree_4$AUC## [1] 0.7244987
vip(model_tree_4, num_features = 50)Bagging
Model Default
Model dengan hyperparameter nbagg default dan tree default
# Bagged classification trees on the training data: 25 bootstrap replications
# (nbagg = 25), each tree grown with minsplit = 2 and cp = 0. coob = TRUE
# requests the out-of-bag estimate of the misclassification error.
model_bag_1 <- ipred::bagging(y ~ ., data=data_sinta_train, coob = TRUE,
nbagg=25,
control= rpart.control(minsplit=2, cp=0))
model_bag_1##
## Bagging classification trees with 25 bootstrap replications
##
## Call: bagging.data.frame(formula = y ~ ., data = data_sinta_train,
## coob = TRUE, nbagg = 25, control = rpart.control(minsplit = 2,
## cp = 0))
##
## Out-of-bag estimate of misclassification error: 0.1198
# Predictions on the training data: class probabilities and hard class labels
# from model_bag_1, then a confusion matrix against the observed y with "1"
# as the positive class.
prediksi_prob_data_train <- predict(model_bag_1, data_sinta_train, type = "prob")
prediksi_data_train <- predict(model_bag_1, data_sinta_train,type="class")
eval_model_bag_1_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
eval_model_bag_1_train## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 1056 194
## 1 43 133
##
## Accuracy : 0.8338
## 95% CI : (0.8134, 0.8528)
## No Information Rate : 0.7707
## P-Value [Acc > NIR] : 2.586e-09
##
## Kappa : 0.4388
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.40673
## Specificity : 0.96087
## Pos Pred Value : 0.75568
## Neg Pred Value : 0.84480
## Prevalence : 0.22931
## Detection Rate : 0.09327
## Detection Prevalence : 0.12342
## Balanced Accuracy : 0.68380
##
## 'Positive' Class : 1
##
ROC_model_bag_1_train <- rocit(score=prediksi_prob_data_train[,2], class=data_sinta_train$y)
plot(ROC_model_bag_1_train)ROC_model_bag_1_train$AUC## [1] 0.7982667
# Predictions on the testing data: class probabilities and hard class labels
# from model_bag_1, then a confusion matrix against the observed y with "1"
# as the positive class.
prediksi_prob_data_test <- predict(model_bag_1, data_sinta_test, type = "prob")
prediksi_data_test <- predict(model_bag_1, data_sinta_test,type="class")
eval_model_bag_1<- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
eval_model_bag_1## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 444 89
## 1 26 50
##
## Accuracy : 0.8112
## 95% CI : (0.7778, 0.8415)
## No Information Rate : 0.7718
## P-Value [Acc > NIR] : 0.01047
##
## Kappa : 0.3622
##
## Mcnemar's Test P-Value : 7.402e-09
##
## Sensitivity : 0.3597
## Specificity : 0.9447
## Pos Pred Value : 0.6579
## Neg Pred Value : 0.8330
## Prevalence : 0.2282
## Detection Rate : 0.0821
## Detection Prevalence : 0.1248
## Balanced Accuracy : 0.6522
##
## 'Positive' Class : 1
##
ROC_model_bag_1 <- rocit(score=prediksi_prob_data_test[,2], class=data_sinta_test$y)
plot(ROC_model_bag_1)ROC_model_bag_1$AUC## [1] 0.7415429
Random Forest
Model 1 Default
Model dengan hyperparameter ntree, mtry default
# Fit a random forest with 500 trees on the training data. mtry is not set in
# the call, so the package default applies.
model_rf_1 <- randomForest::randomForest(y ~ ., ntree=500, data=data_sinta_train)
Prediksi pada Data Training
prediksi_prob_data_train <- predict(model_rf_1, data_sinta_train, type = "prob")
prediksi_data_train <- predict(model_rf_1, data_sinta_train,type="class")
eval_model_rf_1_train <- caret::confusionMatrix(prediksi_data_train, data_sinta_train$y, positive="1")
eval_model_rf_1_train
ROC_model_rf_1_train <- rocit(score=prediksi_prob_data_train[,2], class=data_sinta_train$y)
plot(ROC_model_rf_1_train)
ROC_model_rf_1_train$AUC
Prediksi pada Data Testing
prediksi_prob_data_test <- predict(model_rf_1, data_sinta_test, type = "prob")
prediksi_data_test <- predict(model_rf_1, data_sinta_test,type="class")
eval_model_rf_1<- caret::confusionMatrix(prediksi_data_test, data_sinta_test$y, positive="1")
eval_model_rf_1
ROC_model_rf_1 <- rocit(score=prediksi_prob_data_test[,2], class=data_sinta_test$y)
plot(ROC_model_rf_1)
ROC_model_rf_1$AUC
vip(model_rf_1, num_features = 50)
Perbandingan Hasil Model
hasil_eval <- rbind(
  c(eval_reglog_1$overall[1], eval_reglog_1$byClass[1], eval_reglog_1$byClass[2]),
  c(eval_reglog_2$overall[1], eval_reglog_2$byClass[1], eval_reglog_2$byClass[2]),
  c(eval_tree_1$overall[1], eval_tree_1$byClass[1], eval_tree_1$byClass[2]),
  c(eval_tree_2$overall[1], eval_tree_2$byClass[1], eval_tree_2$byClass[2]),
  c(eval_tree_3$overall[1], eval_tree_3$byClass[1], eval_tree_3$byClass[2]),
  c(eval_tree_4$overall[1], eval_tree_4$byClass[1], eval_tree_4$byClass[2]),
  c(eval_model_bag_1$overall[1], eval_model_bag_1$byClass[1], eval_model_bag_1$byClass[2]),
  c(eval_model_rf_1$overall[1], eval_model_rf_1$byClass[1], eval_model_rf_1$byClass[2]))
row.names(hasil_eval) <- c("RegLog Semua Peubah","RegLog Seleksi Peubah", "ClassTree 1","ClassTree 2","ClassTree 3","ClassTree 4", "Bagging 1", "RandomForest 1")
hasil_eval <- as.data.frame(hasil_eval)
dplyr::arrange(.data = hasil_eval, desc(Accuracy))