Libraries Used
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.7
## v tidyr 1.1.4 v stringr 1.4.0
## v readr 2.1.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
Import the Titanic Dataset
Titanic <- read.csv("train.csv")
Exploratory Data Analysis
glimpse(Titanic)
## Rows: 891
## Columns: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,~
## $ Survived <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1~
## $ Pclass <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3~
## $ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl~
## $ Sex <chr> "male", "female", "female", "female", "male", "male", "mal~
## $ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, ~
## $ SibSp <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0~
## $ Parch <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0~
## $ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37~
## $ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,~
## $ Cabin <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6", "C~
## $ Embarked <chr> "S", "C", "S", "S", "S", "Q", "S", "S", "S", "C", "S", "S"~
Data Manipulation
The data manipulation step:
- drops rows containing NA values (see the quick check below),
- drops columns whose information is not needed,
- fixes unsuitable data types: the character column Sex is encoded as a dummy variable (0 and 1), with 1 for Sex == "male" and 0 for Sex == "female".
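Before dropping rows, it helps to see how much data is actually missing; a quick base-R check (note that Cabin uses empty strings rather than NA, so it is not counted here):
# Count missing values per column.
colSums(is.na(Titanic))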
Titanic_clean <- Titanic %>%
na.omit() %>%
mutate(sex_male = ifelse(Sex=="male", 1, 0)) %>%
select(-c(PassengerId, Name, Ticket, Cabin, Embarked, Sex))
head(Titanic_clean)
## Survived Pclass Age SibSp Parch Fare sex_male
## 1 0 3 22 1 0 7.2500 1
## 2 1 1 38 1 0 71.2833 0
## 3 1 3 26 0 0 7.9250 0
## 4 1 1 35 1 0 53.1000 0
## 5 0 3 35 0 0 8.0500 1
## 7 0 1 54 0 0 51.8625 1
Data Pre-Processing
Check Class Proportions
Check the class proportions of the target variable to be predicted; here the target is the Survived column.
prop.table(table(Titanic_clean$Survived))
##
## 0 1
## 0.5938375 0.4061625
The proportions of survivors and non-survivors are reasonably balanced, so there is no need to drop data to rebalance the classes.
Create test and train sets / Split the dataset for model evaluation.
set.seed(123)
row_data <- nrow(Titanic_clean)
index <- sample(row_data, row_data*0.8)
data_train <- Titanic_clean[ index, ]
data_test <- Titanic_clean[ -index, ]
Modelling
Logistic_model <- glm(Survived ~., data = data_train, family = "binomial")
summary(Logistic_model)
##
## Call:
## glm(formula = Survived ~ ., family = "binomial", data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6588 -0.6839 -0.4044 0.6625 2.3808
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.904877 0.638415 7.683 1.56e-14 ***
## Pclass -1.142553 0.174464 -6.549 5.80e-11 ***
## Age -0.038864 0.008769 -4.432 9.33e-06 ***
## SibSp -0.340986 0.137764 -2.475 0.0133 *
## Parch -0.078780 0.129753 -0.607 0.5437
## Fare 0.002104 0.002515 0.837 0.4027
## sex_male -2.557370 0.239531 -10.677 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 766.59 on 570 degrees of freedom
## Residual deviance: 521.49 on 564 degrees of freedom
## AIC: 535.49
##
## Number of Fisher Scoring iterations: 5
From the summary, Pclass, Age, SibSp, and sex_male are significant predictors, all with negative coefficients, while Parch and Fare are not significant.
Prediction
logistic_pred <- predict(Logistic_model, newdata = data_test, type = "response")
head(logistic_pred)
## 1 3 10 21 25 35
## 0.09441458 0.61855887 0.85790210 0.22389384 0.52720288 0.48717928
Converting probabilities to Yes and No
From the predicted probabilities we can assign a label to each observation. A common rule uses a 0.5 threshold: probabilities above 0.5 are labeled Yes, and those below are labeled No. The resulting labels are then converted to a factor.
log_pred_class <- ifelse(logistic_pred > 0.5, "Yes", "No") %>%
as.factor()
head(log_pred_class)
## 1 3 10 21 25 35
## No Yes Yes No Yes No
## Levels: No Yes
Model Evaluation
df_pred <- data.frame(pred = logistic_pred)
ggplot(df_pred, aes(x=pred)) +
geom_density(lwd=0.5) +
labs(title = "Distribution of Probability Prediction Data") +
theme_minimal()
Overfitting / underfitting check
Predict on the training data
Log_pred_train <- predict(Logistic_model, newdata = data_train, type = "response")
Log_pred_class_train <- ifelse(Log_pred_train > 0.5, "Yes", "No") %>%
as.factor()
# Convert the target variable in the test and train data back to "Yes" and "No".
data_test <- data_test %>%
mutate(Survived = as.factor(ifelse(Survived==1, "Yes", "No")))
data_train <- data_train %>%
mutate(Survived = as.factor(ifelse(Survived==1, "Yes", "No")))
Log_reg_mod_test <- confusionMatrix(log_pred_class, as.factor(data_test$Survived), positive = "Yes")
Log_reg_mod_train <- confusionMatrix(Log_pred_class_train, as.factor(data_train$Survived), positive = "Yes")
Logistic regression model results on the test data
Log_reg_mod_test
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 74 18
## Yes 5 46
##
## Accuracy : 0.8392
## 95% CI : (0.7685, 0.8952)
## No Information Rate : 0.5524
## P-Value [Acc > NIR] : 3.21e-13
##
## Kappa : 0.6683
##
## Mcnemar's Test P-Value : 0.01234
##
## Sensitivity : 0.7188
## Specificity : 0.9367
## Pos Pred Value : 0.9020
## Neg Pred Value : 0.8043
## Prevalence : 0.4476
## Detection Rate : 0.3217
## Detection Prevalence : 0.3566
## Balanced Accuracy : 0.8277
##
## 'Positive' Class : Yes
##
Logistic regression model results on the training data
Log_reg_mod_train
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 297 69
## Yes 48 157
##
## Accuracy : 0.7951
## 95% CI : (0.7596, 0.8275)
## No Information Rate : 0.6042
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.5646
##
## Mcnemar's Test P-Value : 0.06446
##
## Sensitivity : 0.6947
## Specificity : 0.8609
## Pos Pred Value : 0.7659
## Neg Pred Value : 0.8115
## Prevalence : 0.3958
## Detection Rate : 0.2750
## Detection Prevalence : 0.3590
## Balanced Accuracy : 0.7778
##
## 'Positive' Class : Yes
##
The model is judged to be neither underfitting nor overfitting, because the accuracies of its predictions on the training and test data differ by less than 10 percentage points. Although prediction on the test data is slightly better, which would point toward underfitting, the gap is small.
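The gap can be checked directly from the confusion matrix objects; a minimal sketch, reusing the objects created above:
# Compare train and test accuracy; a gap under about 0.10 supports the conclusion.
acc_train <- Log_reg_mod_train$overall["Accuracy"]
acc_test <- Log_reg_mod_test$overall["Accuracy"]
abs(acc_train - acc_test)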
Using step() to improve the model
Improve the model by removing predictors judged not to be influential; step() performs stepwise selection based on AIC.
log_model_new <- step(Logistic_model, trace = 0)
log_pred_new <- predict(log_model_new, newdata = data_test, type = "response")
log_pred_new_class <- ifelse(log_pred_new > 0.5, "Yes", "No") %>%
as.factor()
log_new <- confusionMatrix(log_pred_new_class, as.factor(data_test$Survived), positive = "Yes")
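To see which predictors the stepwise search kept, the reduced model's formula can be inspected (not part of the original run):
# Formula of the AIC-selected model.
formula(log_model_new)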
Results of the new logistic regression model
log_new
## Confusion Matrix and Statistics
##
## Reference
## Prediction No Yes
## No 73 17
## Yes 6 47
##
## Accuracy : 0.8392
## 95% CI : (0.7685, 0.8952)
## No Information Rate : 0.5524
## P-Value [Acc > NIR] : 3.21e-13
##
## Kappa : 0.6693
##
## Mcnemar's Test P-Value : 0.03706
##
## Sensitivity : 0.7344
## Specificity : 0.9241
## Pos Pred Value : 0.8868
## Neg Pred Value : 0.8111
## Prevalence : 0.4476
## Detection Rate : 0.3287
## Detection Prevalence : 0.3706
## Balanced Accuracy : 0.8292
##
## 'Positive' Class : Yes
##
K-Nearest Neighbour
Data Pre-Processing
Create test and train sets / Split the dataset again for model evaluation (this also restores the 0/1 target, since Survived was relabeled to Yes/No above).
set.seed(123)
row_data <- nrow(Titanic_clean)
index <- sample(row_data, row_data*0.8)
data_train <- Titanic_clean[ index, ]
data_test <- Titanic_clean[ -index, ]
Scaling
The Pclass, SibSp, Parch, Age, and Fare columns take large, non-uniform values, while the other columns range only from 0 to 1, so the features need to be put on a common scale.
Scaling Data Train
# Scaling Data Train
scaled_train <- data_train %>%
select(-Survived) %>%
scale()
# Store the train target variable
target_train <- data_train$Survived
Scaling Data Test
The test data is scaled using the center and scale values computed from the training data, so the preprocessing relies only on information from the training set.
# Scaling Data Test
scaled_test <- data_test %>%
select(-Survived) %>%
scale(center = attr(scaled_train, "scaled:center"),
scale = attr(scaled_train, "scaled:scale")
)
# Store the test target variable
target_test <- data_test$Survived
Modelling
Choose the value of K as the rounded square root of the number of training rows, a common rule of thumb.
k_choose <- sqrt(nrow(scaled_train)) %>% round()
k_choose
## [1] 24
knn_model <- knn3(x = scaled_train,
y = as.factor(target_train),
k = k_choose)
Prediction
knn_pred <- predict(knn_model, newdata = scaled_test, type = "class")
head(knn_pred)
## [1] 0 1 1 0 0 0
## Levels: 0 1
Model Evaluation
Overfitting / underfitting check
Predict on the training data
knn_pred_train <- predict(knn_model, newdata = scaled_train, type = "class")
knn_mod_test <- confusionMatrix(knn_pred, as.factor(data_test$Survived), positive = "1")
knn_mod_train <- confusionMatrix(knn_pred_train, as.factor(data_train$Survived), positive = "1")
KNN model results on the training data
knn_mod_train
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 302 58
## 1 43 168
##
## Accuracy : 0.8231
## 95% CI : (0.7893, 0.8536)
## No Information Rate : 0.6042
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6259
##
## Mcnemar's Test P-Value : 0.1636
##
## Sensitivity : 0.7434
## Specificity : 0.8754
## Pos Pred Value : 0.7962
## Neg Pred Value : 0.8389
## Prevalence : 0.3958
## Detection Rate : 0.2942
## Detection Prevalence : 0.3695
## Balanced Accuracy : 0.8094
##
## 'Positive' Class : 1
##
KNN model results on the test data
knn_mod_test
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 75 19
## 1 4 45
##
## Accuracy : 0.8392
## 95% CI : (0.7685, 0.8952)
## No Information Rate : 0.5524
## P-Value [Acc > NIR] : 3.21e-13
##
## Kappa : 0.6673
##
## Mcnemar's Test P-Value : 0.003509
##
## Sensitivity : 0.7031
## Specificity : 0.9494
## Pos Pred Value : 0.9184
## Neg Pred Value : 0.7979
## Prevalence : 0.4476
## Detection Rate : 0.3147
## Detection Prevalence : 0.3427
## Balanced Accuracy : 0.8262
##
## 'Positive' Class : 1
##
The model is judged to be neither underfitting nor overfitting, because the accuracies of its predictions on the training and test data differ by less than 10 percentage points.
Changing the value of K
Improve the model by changing the value to k = 16.
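One way to motivate this choice is to scan candidate k values against the test set; a minimal sketch, assuming the scaled objects above (tuning directly on the test set is a shortcut, and cross-validation would be more rigorous):
# Test accuracy for a range of candidate k values.
k_candidates <- 3:25
acc_by_k <- sapply(k_candidates, function(k) {
  mod <- knn3(x = scaled_train, y = as.factor(target_train), k = k)
  pred <- predict(mod, newdata = scaled_test, type = "class")
  mean(as.character(pred) == as.character(target_test))
})
k_candidates[which.max(acc_by_k)] # best-scoring k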
knn_model_new <- knn3(x = scaled_train,
y = as.factor(target_train),
k = 16)
knn_pred_new <- predict(knn_model_new, newdata = scaled_test, type = "class")
knn_new <- confusionMatrix(knn_pred_new, as.factor(data_test$Survived), positive = "1")
Results of the new KNN model
knn_new
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 77 17
## 1 2 47
##
## Accuracy : 0.8671
## 95% CI : (0.8003, 0.9181)
## No Information Rate : 0.5524
## P-Value [Acc > NIR] : 6.638e-16
##
## Kappa : 0.7252
##
## Mcnemar's Test P-Value : 0.001319
##
## Sensitivity : 0.7344
## Specificity : 0.9747
## Pos Pred Value : 0.9592
## Neg Pred Value : 0.8191
## Prevalence : 0.4476
## Detection Rate : 0.3287
## Detection Prevalence : 0.3427
## Balanced Accuracy : 0.8545
##
## 'Positive' Class : 1
##
Logistic Regression Model
eval_log <- tibble(Model = "Logistic Regression",
                   Accuracy = log_new$overall[1],
                   Recall = log_new$byClass[1],
                   Specificity = log_new$byClass[2],
                   Precision = log_new$byClass[3])
K-Nearest Neighbour Model
eval_knn <- tibble(Model = "K-Nearest Neighbour",
Accuracy = knn_new$overall[1],
Recall = knn_new$byClass[1],
Specificity = knn_new$byClass[2],
Precision = knn_new$byClass[3])
Compare Models
eval_log %>% bind_rows(eval_knn)
## # A tibble: 2 x 5
## Model Accuracy Recall Specificity Precision
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 Logistic Regression 0.839 0.734 0.924 0.887
## 2 K-Nearest Neighbour 0.867 0.734 0.975 0.959
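As one more point of comparison, the F1 score can be derived from the precision and recall in the table above; a small sketch using those values:
# F1 is the harmonic mean of precision and recall.
f1 <- function(precision, recall) 2 * precision * recall / (precision + recall)
f1(0.887, 0.734) # Logistic Regression
f1(0.959, 0.734) # K-Nearest Neighbour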
Based on the two models above, the K-Nearest Neighbour model gives the better results.
On the Titanic dataset, K-Nearest Neighbour predicts passenger survival better than Logistic Regression: at the same recall, it achieves higher accuracy, specificity, and precision.