Campus Recruitment Academic and Employability Factors influencing placement source data : https://www.kaggle.com/benroshan/factors-affecting-campus-placement
Suasana Kampus
| Variabel | Keterangan |
|---|---|
| ssc_p | Secondary Education percentage- 10th Grade |
| ssc_b | Board of Education- Central/ Others |
| hsc_p | Higher Secondary Education percentage- 12th Grade |
| hsc_b | Board of Education- Central/ Others |
| hsc_s | Specialization in Higher Secondary Education |
| degree_p | Degree Percentage |
| degree_t | Under Graduation(Degree type)- Field of degree education |
| workex | Work Experience |
| specialisation | Post Graduation(MBA)- Specialization |
| status | Status of placement- Placed/Not placed |
| salary | Salary offered by corporate to candidates |
| mba_p | MBA percentage |
# Print per-column summary statistics for the placement data set.
summary(placement_data)
sl_no gender ssc_p ssc_b
Min. : 1.0 Length:215 Min. :40.89 Length:215
1st Qu.: 54.5 Class :character 1st Qu.:60.60 Class :character
Median :108.0 Mode :character Median :67.00 Mode :character
Mean :108.0 Mean :67.30
3rd Qu.:161.5 3rd Qu.:75.70
Max. :215.0 Max. :89.40
hsc_p hsc_b hsc_s degree_p
Min. :37.00 Length:215 Length:215 Min. :50.00
1st Qu.:60.90 Class :character Class :character 1st Qu.:61.00
Median :65.00 Mode :character Mode :character Median :66.00
Mean :66.33 Mean :66.37
3rd Qu.:73.00 3rd Qu.:72.00
Max. :97.70 Max. :91.00
degree_t workex etest_p specialisation mba_p
Length:215 No :141 Min. :50.0 Length:215 Min. :51.21
Class :character Yes: 74 1st Qu.:60.0 Class :character 1st Qu.:57.95
Mode :character Median :71.0 Mode :character Median :62.00
Mean :72.1 Mean :62.28
3rd Qu.:83.5 3rd Qu.:66.25
Max. :98.0 Max. :77.89
status salary
Length:215 Min. :200000
Class :character 1st Qu.:240000
Mode :character Median :265000
Mean :288655
3rd Qu.:300000
Max. :940000
NA's :67
# High-level overview of the data set (presumably the DataExplorer helpers;
# confirm against the library() calls at the top of the notebook).
introduce(placement_data)
plot_intro(placement_data)

# Missing-value profile, as a table and as a chart.
profile_missing(placement_data)
plot_missing(placement_data, title = "Variabel Data Hilang")

# Density curves for the variables, laid out in two columns.
plot_density(
  placement_data,
  geom_density_args = list(fill = "purple", alpha = 0.6),
  ncol = 2L,
  ggtheme = theme_gray(),
  title = "Distribusi Peluang Masing-masing Variabel"
)

# Convert the two columns to factors in one step, then inspect the structure.
placement_data[c("workex", "gender")] <-
  lapply(placement_data[c("workex", "gender")], as.factor)
str(placement_data)
'data.frame': 215 obs. of 15 variables:
$ sl_no : int 1 2 3 4 5 6 7 8 9 10 ...
$ gender : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 1 2 2 2 ...
$ ssc_p : num 67 79.3 65 56 85.8 ...
$ ssc_b : chr "Others" "Central" "Central" "Central" ...
$ hsc_p : num 91 78.3 68 52 73.6 ...
$ hsc_b : chr "Others" "Others" "Central" "Central" ...
$ hsc_s : chr "Commerce" "Science" "Arts" "Science" ...
$ degree_p : num 58 77.5 64 52 73.3 ...
$ degree_t : chr "Sci&Tech" "Sci&Tech" "Comm&Mgmt" "Sci&Tech" ...
$ workex : Factor w/ 2 levels "No","Yes": 1 2 1 1 1 2 1 2 1 1 ...
$ etest_p : num 55 86.5 75 66 96.8 ...
$ specialisation: chr "Mkt&HR" "Mkt&Fin" "Mkt&Fin" "Mkt&HR" ...
$ mba_p : num 58.8 66.3 57.8 59.4 55.5 ...
$ status : chr "Placed" "Placed" "Placed" "Not Placed" ...
$ salary : int 270000 200000 250000 NA 425000 NA NA 252000 231000 NA ...
Data di atas tidak memiliki baris yang kosong, tetapi terdapat 67 nilai yang tidak diketahui (NA). Seluruh nilai NA tersebut berada pada variabel ‘salary’, karena 67 mahasiswa tersebut berstatus ‘Not Placed’ sehingga tidak memiliki tawaran gaji.
# Number of observations per placement status.
placement_data %>%
  group_by(status) %>%
  count()

# Pairwise correlation between the five percentage-score columns, rounded to
# two decimals for display.
placement_data_corr <- select(
  placement_data,
  ssc_p, hsc_p, degree_p, etest_p, mba_p
)
corr <- round(cor(placement_data_corr), 2)

# Lower-triangle correlation heatmap with the coefficients printed in the
# cells. theme_get() returns the currently active theme; the previous
# theme_update() call was a setter used only for its return value.
ggcorrplot::ggcorrplot(
  corr,
  method = "square",
  ggtheme = theme_get(),
  lab = TRUE,
  lab_size = 5,
  type = "lower",
  colors = c("blue", "white", "purple"),
  digits = 2
)
# Stacked bar chart showing, for every level of one categorical column, the
# share (in percent) of each placement status within that level.
#
# data        data frame containing the column `var` and a `status` column
# var         unquoted name of the categorical column for the x axis
# x_label     x-axis label, passed unchanged to labs()
# y_label     y-axis label, passed unchanged to labs()
# plot_title  plot title, passed unchanged to labs()
#
# Returns the ggplot object (printed automatically when called at top level).
plot_status_share <- function(data, var, x_label, y_label, plot_title) {
  shares <- data %>%
    count({{ var }}, status) %>%
    group_by({{ var }}) %>%
    mutate(n = n / sum(n) * 100) %>% # counts -> percent within each level
    ungroup()

  ggplot(shares) +
    aes({{ var }}, n, fill = status, label = paste0(round(n, 2), "%")) +
    geom_col() +
    geom_text(position = position_stack(0.5)) + # centre label in its segment
    labs(x = x_label, y = y_label, title = plot_title)
}

# Bar chart of the number of rows per level of one categorical column, with
# the count written inside each bar.
#
# data        data frame containing the column `var`
# var         unquoted name of the categorical column
# x_label     x-axis label
# plot_title  plot title
#
# Returns the ggplot object (printed automatically when called at top level).
plot_level_counts <- function(data, var, x_label, plot_title) {
  ggplot(data, aes(x = factor({{ var }}), fill = {{ var }})) +
    geom_bar() +
    # after_stat(count) replaces the deprecated ..count.. notation.
    geom_text(
      aes(label = after_stat(count)),
      stat = "count", vjust = 1.5, colour = "white"
    ) +
    labs(x = x_label, y = "Jumlah") +
    ggtitle(plot_title)
}

# NOTE(review): the y axis of every plot_status_share() chart holds
# percentages, while the "Banyaknya ..." labels read as counts -- confirm
# whether the labels should be reworded.

# Gender
plot_status_share(
  placement_data, gender,
  x_label = "Gender",
  y_label = "Banyaknya gender",
  plot_title = "Persentase Status Terhadap Gender"
)
plot_level_counts(
  placement_data, gender,
  x_label = "Gender",
  plot_title = "Rasio Variabel gender "
)

# Board of education (higher secondary)
plot_status_share(
  placement_data, hsc_b,
  x_label = "Board of Education",
  y_label = "Banyaknya Board of Education",
  plot_title = "Persentase Status Terhadap Board of Education "
)
plot_level_counts(
  placement_data, hsc_b,
  x_label = "Board of Education",
  plot_title = "Rasio Variabel Board of Education"
)

# Specialization in higher secondary education
plot_status_share(
  placement_data, hsc_s,
  x_label = "Specialization in Higher Secondary Education ",
  y_label = "Banyaknya Specialization in Higher Secondary Education ",
  plot_title = "Persentase Status terhadap Specialization in Higher Secondary Education"
)
plot_level_counts(
  placement_data, hsc_s,
  x_label = "Specialization in Higher Secondary Education",
  plot_title = "Rasio Variabel Specialization in Higher Secondary Education"
)

# Undergraduate degree type
plot_status_share(
  placement_data, degree_t,
  x_label = "Under Graduation ",
  y_label = "Banyaknya Under Graduation",
  plot_title = "Persentase Status terhadap Under Graduation"
)
plot_level_counts(
  placement_data, degree_t,
  x_label = "Under Graduation",
  plot_title = "Rasio Variabel Under Graduation"
)

# Work experience
plot_status_share(
  placement_data, workex,
  x_label = "Work Experience ",
  y_label = "Banyaknya Work Experience",
  plot_title = "Persentase Status terhadap Work Experience"
)
plot_level_counts(
  placement_data, workex,
  x_label = "Work Experience",
  plot_title = "Rasio Variabel Work Experience"
)

# MBA specialisation
plot_status_share(
  placement_data, specialisation,
  x_label = "Post Graduation(MBA) ",
  y_label = "Banyaknya Post Graduation(MBA)",
  plot_title = "Persentase Status terhadap Post Graduation(MBA)"
)
plot_level_counts(
  placement_data, specialisation,
  x_label = "Post Graduation(MBA)",
  plot_title = "Rasio variabel Post Graduation(MBA)"
)
# Histogram of each student's mean score across the four education stages
# (ssc_p, hsc_p, degree_p, mba_p), with bars filled by placement status.
# A manual palette, c("#DC3220", "#40B0A6"), was tried via scale_fill_manual()
# and scale_colour_manual() and is currently disabled.
ggplot(
  data = mutate(
    placement_data,
    rataan_total = (ssc_p + hsc_p + degree_p + mba_p) / 4
  ),
  mapping = aes(x = rataan_total, fill = status)
) +
  geom_histogram(binwidth = 5, colour = "black") +
  labs(
    title = "Rata-rata hasil dari 'ssc_p, hsc_p, degree_p, dan mba_p'",
    x = "Rata-rata nilai 4 sistem pendidikan",
    y = "Banyaknya murid",
    fill = "Status"
  )
Sebanyak 67 dari total 215 data tidak memiliki nilai ‘salary’. Karena variabel ‘salary’ dikeluarkan dari pemodelan, seluruh 215 data tetap dapat diolah.
Rasio data latihan terhadap data prediksi adalah 80:20, yaitu 173 data latihan dan 42 data prediksi.
# Fix the random seed so the train/test partition is reproducible.
set.seed(100)

# Modelling table: drop the salary column and turn status into a factor whose
# levels are syntactically valid R names (make.names()).
placement_data_olah <- mutate(
  select(placement_data, -salary),
  status = as.factor(make.names(status))
)

# Row indices of the training partition (p = 0.8), sampled on status.
split <- createDataPartition(placement_data_olah$status, p = 0.8, list = FALSE)

data_latihan <- placement_data_olah[split, ]
data_prediksi <- placement_data_olah[-split, ]

# Keep the training labels aside, then strip the id and label columns from
# the training predictors. The hold-out set keeps status for later scoring.
class_data_latihan <- data_latihan[["status"]]
data_latihan <- select(data_latihan, -c(sl_no, status))
data_prediksi <- select(data_prediksi, -sl_no)

# Single C5.0 decision tree on the training predictors and labels.
placement_data_c50 <- C5.0(x = data_latihan, y = class_data_latihan)
summary(placement_data_c50)
Call:
C5.0.default(x = data_latihan, y = class_data_latihan)
C5.0 [Release 2.07 GPL Edition] Sun Jul 25 09:40:37 2021
-------------------------------
Class specified by attribute `outcome'
Read 173 cases (13 attributes) from undefined.data
Decision tree:
ssc_p <= 56.28: Not.Placed (27/1)
ssc_p > 56.28:
:...workex = Yes: Placed (52/1)
workex = No:
:...hsc_p <= 52: Not.Placed (7)
hsc_p > 52:
:...degree_p > 65: Placed (48/4)
degree_p <= 65:
:...hsc_p > 70.4: Placed (9)
hsc_p <= 70.4:
:...mba_p <= 57.99: Placed (12/2)
mba_p > 57.99:
:...ssc_p <= 70.5: Not.Placed (14/1)
ssc_p > 70.5: Placed (4/1)
Evaluation on training data (173 cases):
Decision Tree
----------------
Size Errors
8 10( 5.8%) <<
(a) (b) <-classified as
---- ----
46 8 (a): class Not.Placed
2 117 (b): class Placed
Attribute usage:
100.00% ssc_p
84.39% workex
54.34% hsc_p
50.29% degree_p
17.34% mba_p
Time: 0.0 secs
# Draw the single C5.0 decision tree fitted above.
plot(placement_data_c50)
Dilakukan boosting dengan 10 kali percobaan (trials = 10); hasilnya, galat model pada data latihan turun dari 5,8% menjadi 0,0%.
# Refit C5.0 on the same training predictors and labels, this time with
# boosting: trials = 10 requests ten boosting iterations.
placement_data_boost <- C5.0(data_latihan, class_data_latihan, trials = 10)
# Print every boosting trial's tree and the training-set evaluation.
summary(placement_data_boost)
Call:
C5.0.default(x = data_latihan, y = class_data_latihan, trials = 10)
C5.0 [Release 2.07 GPL Edition] Sun Jul 25 09:40:51 2021
-------------------------------
Class specified by attribute `outcome'
Read 173 cases (13 attributes) from undefined.data
----- Trial 0: -----
Decision tree:
ssc_p <= 56.28: Not.Placed (27/1)
ssc_p > 56.28:
:...workex = Yes: Placed (52/1)
workex = No:
:...hsc_p <= 52: Not.Placed (7)
hsc_p > 52:
:...degree_p > 65: Placed (48/4)
degree_p <= 65:
:...hsc_p > 70.4: Placed (9)
hsc_p <= 70.4:
:...mba_p <= 57.99: Placed (12/2)
mba_p > 57.99:
:...ssc_p <= 70.5: Not.Placed (14/1)
ssc_p > 70.5: Placed (4/1)
----- Trial 1: -----
Decision tree:
ssc_p > 77.8: Placed (24.5)
ssc_p <= 77.8:
:...ssc_p <= 52.58: Not.Placed (13.8)
ssc_p > 52.58:
:...workex = Yes: Placed (33.9/5.6)
workex = No:
:...ssc_b = Others: Placed (29.3/8.4)
ssc_b = Central:
:...ssc_p <= 52.6: Placed (4.8)
ssc_p > 52.6:
:...degree_t in {Sci&Tech,Others}: Not.Placed (27/2.3)
degree_t = Comm&Mgmt:
:...ssc_p <= 63.4: Not.Placed (26.7/6.9)
ssc_p > 63.4: Placed (13/1.5)
----- Trial 2: -----
Decision tree:
ssc_p > 77.8: Placed (19.4)
ssc_p <= 77.8:
:...ssc_p <= 52.58: Not.Placed (10.9)
ssc_p > 52.58:
:...hsc_p <= 64.2:
:...hsc_p > 63: Not.Placed (19.3)
: hsc_p <= 63:
: :...hsc_p <= 54: Not.Placed (15.1/0.6)
: hsc_p > 54: Placed (36.6/13.3)
hsc_p > 64.2:
:...hsc_s in {Science,Arts}: Placed (21.9)
hsc_s = Commerce:
:...ssc_p > 67: Placed (11.5)
ssc_p <= 67:
:...degree_p <= 64.5: Not.Placed (13.8/1.8)
degree_p > 64.5: Placed (24.5/8.9)
----- Trial 3: -----
Decision tree:
ssc_p > 77.8: Placed (15.3)
ssc_p <= 77.8:
:...workex = Yes:
:...ssc_p <= 56.6: Not.Placed (4.9)
: ssc_p > 56.6: Placed (26.6/4.7)
workex = No:
:...mba_p > 67.2: Not.Placed (23.9/1)
mba_p <= 67.2:
:...hsc_p > 64.2: Placed (48.2/15.8)
hsc_p <= 64.2:
:...degree_p <= 65: Not.Placed (43.9/5.4)
degree_p > 65: Placed (10.3/3.5)
----- Trial 4: -----
Decision tree:
ssc_p > 77.8: Placed (12.3)
ssc_p <= 77.8:
:...ssc_p <= 56.28: Not.Placed (23.5/2.4)
ssc_p > 56.28:
:...hsc_p > 78.5: Placed (10.6)
hsc_p <= 78.5:
:...mba_p > 58.23: Not.Placed (92.2/27.9)
mba_p <= 58.23:
:...mba_p <= 52.21: Not.Placed (4.4)
mba_p > 52.21: Placed (29.9/2.8)
----- Trial 5: -----
Decision tree:
workex = Yes: Placed (41.2/8.1)
workex = No:
:...hsc_p <= 52: Not.Placed (9.5)
hsc_p > 52:
:...ssc_b = Others: Placed (43.2/13.8)
ssc_b = Central:
:...mba_p > 66.06: Not.Placed (24.3/3.6)
mba_p <= 66.06:
:...gender = F: Placed (15.1/2.5)
gender = M:
:...degree_p <= 63.35: Not.Placed (16/1.5)
degree_p > 63.35: Placed (23.6/7.4)
----- Trial 6: -----
Decision tree:
ssc_p > 64:
:...hsc_p > 64: Placed (34.2/1.4)
: hsc_p <= 64:
: :...hsc_p <= 63: Placed (33.4/11)
: hsc_p > 63: Not.Placed (11.5/0.3)
ssc_p <= 64:
:...hsc_p <= 59: Not.Placed (18.7)
hsc_p > 59:
:...degree_p > 73.43: Not.Placed (9.8)
degree_p <= 73.43:
:...degree_p <= 65: Not.Placed (49.2/14.3)
degree_p > 65: Placed (16.3/2.6)
----- Trial 7: -----
Decision tree:
ssc_p <= 56.28: Not.Placed (25.6/1.9)
ssc_p > 56.28:
:...workex = Yes: Placed (35/4.7)
workex = No:
:...hsc_p > 70.2: Placed (21.9/2.5)
hsc_p <= 70.2:
:...mba_p <= 57.99: Placed (24.9/4.9)
mba_p > 57.99: Not.Placed (65.7/17.1)
----- Trial 8: -----
Decision tree:
ssc_p <= 56.28: Not.Placed (22/3)
ssc_p > 56.28:
:...ssc_p > 77.8: Placed (13.4)
ssc_p <= 77.8:
:...mba_p > 68.81: Not.Placed (19.5/3.1)
mba_p <= 68.81:
:...workex = Yes: Placed (17.8)
workex = No:
:...hsc_p <= 54: Not.Placed (8)
hsc_p > 54:
:...hsc_p > 70.2: Placed (13.1)
hsc_p <= 70.2:
:...hsc_p > 69.4: Not.Placed (8.1/0.5)
hsc_p <= 69.4:
:...degree_t = Sci&Tech: Not.Placed (27.5/11.4)
degree_t in {Comm&Mgmt,Others}: Placed (43.5/8.1)
----- Trial 9: -----
Decision tree:
degree_p > 65:
:...ssc_b = Others: Placed (30.6/1)
: ssc_b = Central:
: :...workex = Yes: Placed (12.5)
: workex = No:
: :...etest_p <= 74.4: Placed (17.6/1.5)
: etest_p > 74.4: Not.Placed (23.3/7.5)
degree_p <= 65:
:...ssc_p <= 56.6: Not.Placed (11.6)
ssc_p > 56.6:
:...hsc_p > 70.4: Placed (8.7)
hsc_p <= 70.4:
:...mba_p > 67.69: Not.Placed (10.3)
mba_p <= 67.69:
:...workex = Yes: Placed (7.8)
workex = No:
:...mba_p > 66.06: Placed (6.4)
mba_p <= 66.06:
:...ssc_p > 68: Placed (4.7)
ssc_p <= 68:
:...mba_p <= 57.99: Placed (16.6/6.3)
mba_p > 57.99: Not.Placed (23)
Evaluation on training data (173 cases):
Trial Decision Tree
----- ----------------
Size Errors
0 8 10( 5.8%)
1 8 27(15.6%)
2 9 16( 9.2%)
3 7 20(11.6%)
4 6 49(28.3%)
5 7 27(15.6%)
6 7 21(12.1%)
7 5 27(15.6%)
8 9 19(11.0%)
9 12 14( 8.1%)
boost 0( 0.0%) <<
(a) (b) <-classified as
---- ----
54 (a): class Not.Placed
119 (b): class Placed
Attribute usage:
100.00% ssc_p
100.00% hsc_p
100.00% degree_p
100.00% workex
83.24% ssc_b
83.24% mba_p
39.88% degree_t
38.15% hsc_s
25.43% gender
15.61% etest_p
Time: 0.0 secs
# Plot the boosted C5.0 model.
plot(placement_data_boost)

# Fit the classification tree on the TRAINING rows only (sl_no, the first
# column, is dropped). Fitting on the full table would let the model see the
# hold-out rows it is later evaluated on, inflating its accuracy relative to
# the two C5.0 models, which are trained on the training partition.
placement_data_tree <- tree(status ~ ., data = placement_data_olah[split, -1])
summary(placement_data_tree)
Classification tree:
tree(formula = status ~ ., data = placement_data_olah[-1])
Variables actually used in tree construction:
[1] "ssc_p" "hsc_p" "mba_p" "workex" "degree_p"
Number of terminal nodes: 13
Residual mean deviance: 0.3372 = 68.12 / 202
Misclassification error rate: 0.07442 = 16 / 215
# Draw the fitted classification tree, then add the split labels on top.
# pretty = 0 asks text() not to abbreviate factor level names.
plot(placement_data_tree)
text(placement_data_tree, pretty = 0)
# Score the three models on the hold-out rows. "Placed" is declared as the
# positive class for every confusion matrix so that the class-specific
# statistics are comparable across models (Accuracy and Kappa do not depend
# on this choice).
# draw_confusion_matrix() is defined elsewhere in the notebook.

# Single C5.0 tree.
prediksi <- predict(placement_data_c50, data_prediksi)
cm_c50 <- confusionMatrix(prediksi, data_prediksi$status, positive = "Placed")
cm <- draw_confusion_matrix(cm_c50)

# Boosted C5.0. The result is stored under a descriptive name instead of `c`,
# which would shadow base::c().
prediksi_boost <- predict(placement_data_boost, data_prediksi)
cm_boost <- confusionMatrix(
  prediksi_boost, data_prediksi$status,
  positive = "Placed"
)
cm_boost_plot <- draw_confusion_matrix(cm_boost)

# Classification tree; the hold-out rows are re-derived from the modelling
# table before predicting.
data_prediksi <- placement_data_olah[-split, ]
p <- predict(placement_data_tree, data_prediksi, type = "class")
cm_tree <- confusionMatrix(p, data_prediksi$status, positive = "Placed")
draw_confusion_matrix(cm_tree)
# Pull Accuracy and Kappa out of each confusion matrix's `overall` vector
# (its first and second entries).
akurasi_c50 <- cm_c50$overall[["Accuracy"]]
akurasi_boost <- cm_boost$overall[["Accuracy"]]
akurasi_tree <- cm_tree$overall[["Accuracy"]]

kappa_c50 <- cm_c50$overall[["Kappa"]]
kappa_boost <- cm_boost$overall[["Kappa"]]
kappa_tree <- cm_tree$overall[["Kappa"]]

# One row per model; the column names come from the variable names.
nama <- c("c50", "boost", "tree")
akurasi <- c(akurasi_c50, akurasi_boost, akurasi_tree)
kappa <- c(kappa_c50, kappa_boost, kappa_tree)

data.frame(nama, akurasi, kappa)
Berdasarkan perbandingan akurasi dan kappa dari ketiga metode klasifikasi di atas, metode tree menunjukkan hasil terbaik, sehingga model yang dipilih adalah metode tree.
Dengan demikian, faktor pertama yang memengaruhi peluang mahasiswa untuk mendapatkan penempatan kerja adalah persentase nilai pendidikan menengah kelas 10 (ssc_p), yaitu variabel yang menjadi pemisah pertama pada pohon keputusan C5.0 di atas.