library(dplyr)
library(gtools)
library(caret)
library(class)

titanic <- read.csv("train.csv")
str(titanic)
#> 'data.frame': 891 obs. of 12 variables:
#> $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
#> $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
#> $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
#> $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
#> $ Sex : chr "male" "female" "female" "female" ...
#> $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
#> $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
#> $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
#> $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
#> $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
#> $ Cabin : chr "" "C85" "" "C123" ...
#> $ Embarked : chr "S" "C" "S" "S" ...
Data Description
head(titanic)
#> PassengerId Survived Pclass
#> 1 1 0 3
#> 2 2 1 1
#> 3 3 1 3
#> 4 4 1 1
#> 5 5 0 3
#> 6 6 0 3
#> Name Sex Age SibSp Parch
#> 1 Braund, Mr. Owen Harris male 22 1 0
#> 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
#> 3 Heikkinen, Miss. Laina female 26 0 0
#> 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
#> 5 Allen, Mr. William Henry male 35 0 0
#> 6 Moran, Mr. James male NA 0 0
#> Ticket Fare Cabin Embarked
#> 1 A/5 21171 7.2500 S
#> 2 PC 17599 71.2833 C85 C
#> 3 STON/O2. 3101282 7.9250 S
#> 4 113803 53.1000 C123 S
#> 5 373450 8.0500 S
#> 6 330877 8.4583 Q
titanic <- titanic %>%
mutate(Survived = as.factor(Survived),
Sex = as.factor(Sex),
Pclass = as.factor(Pclass),
Embarked = as.factor(Embarked),
SibSp = as.factor(SibSp),
Parch = as.factor(Parch))
glimpse(titanic)
#> Rows: 891
#> Columns: 12
#> $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
#> $ Survived <fct> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1…
#> $ Pclass <fct> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3…
#> $ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl…
#> $ Sex <fct> male, female, female, female, male, male, male, male, fema…
#> $ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, …
#> $ SibSp <fct> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0…
#> $ Parch <fct> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0…
#> $ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37…
#> $ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,…
#> $ Cabin <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6", "C…
#> $ Embarked <fct> S, C, S, S, S, Q, S, S, S, C, S, S, S, S, S, S, Q, S, S, C…
Next, we check for missing values. They need to be inspected before we continue with modeling.
colSums(is.na(titanic))
#> PassengerId Survived Pclass Name Sex Age
#> 0 0 0 0 0 177
#> SibSp Parch Ticket Fare Cabin Embarked
#> 0 0 0 0 0 0
# Impute the 177 missing Age values with the mean age
titanic$Age[is.na(titanic$Age)] <- mean(titanic$Age, na.rm = TRUE)
colSums(is.na(titanic))
#> PassengerId Survived Pclass Name Sex Age
#> 0 0 0 0 0 0
#> SibSp Parch Ticket Fare Cabin Embarked
#> 0 0 0 0 0 0
prop.table(table(titanic$Survived))
#>
#> 0 1
#> 0.6161616 0.3838384
table(titanic$Survived)
#>
#> 0 1
#> 549 342
Looking at the proportions of the two classes, they are reasonably balanced, so we do not need additional pre-processing to balance the two levels of the target variable.
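Purely as an illustration (not used in the rest of this analysis): had the classes been strongly imbalanced, caret's downSample() could rebalance them by randomly dropping rows from the majority class. The object name titanic_down below is hypothetical.
# Hedged illustration only; not applied to the data used in this analysis
set.seed(100)
titanic_down <- downSample(x = dplyr::select(titanic, -Survived),
                           y = titanic$Survived,
                           yname = "Survived")
prop.table(table(titanic_down$Survived))   # would show a 0.5 / 0.5 split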
RNGkind(sample.kind = "Rounding")
set.seed(100)
index <- sample(nrow(titanic), nrow(titanic)*0.7)
titanic_train <- titanic[index,]
titanic_test <- titanic[-index,]

model <- glm(formula = Survived ~ Age + Sex + Fare + Cabin + Embarked, family = "binomial", data = titanic_train)
summary(model)
#>
#> Call:
#> glm(formula = Survived ~ Age + Sex + Fare + Cabin + Embarked,
#> family = "binomial", data = titanic_train)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -1.7715 -0.5810 -0.4457 0.6425 2.4236
#>
#> Coefficients: (1 not defined because of singularities)
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) 1.006414 0.344985 2.917 0.00353 **
#> Age -0.028168 0.010190 -2.764 0.00571 **
#> Sexmale -2.242630 0.231418 -9.691 < 0.0000000000000002 ***
#> Fare 0.009544 0.004766 2.003 0.04522 *
#> CabinA10 -17.136000 6522.638610 -0.003 0.99790
#> CabinA16 18.096529 6522.638607 0.003 0.99779
#> CabinA19 -16.741428 6522.638605 -0.003 0.99795
#> CabinA20 20.201945 6522.638623 0.003 0.99753
#> CabinA23 21.769393 6522.638619 0.003 0.99734
#> CabinA24 -16.938562 6522.638607 -0.003 0.99793
#> CabinA31 20.195892 6522.638604 0.003 0.99753
#> CabinA32 -16.970473 6522.638607 -0.003 0.99792
#> CabinA34 19.133740 6522.638610 0.003 0.99766
#> CabinA6 20.252185 6522.638613 0.003 0.99752
#> CabinB101 15.461481 6522.639034 0.002 0.99811
#> CabinB102 -16.493297 6522.638605 -0.003 0.99798
#> CabinB19 -15.931332 6522.638614 -0.002 0.99805
#> CabinB20 19.126186 3811.622270 0.005 0.99600
#> CabinB22 17.896101 6522.638617 0.003 0.99781
#> CabinB28 18.542569 6522.638612 0.003 0.99773
#> CabinB3 16.753962 6522.638661 0.003 0.99795
#> CabinB30 -16.527704 6522.638623 -0.003 0.99798
#> CabinB35 17.137062 6522.638606 0.003 0.99790
#> CabinB38 -16.315687 6522.638607 -0.003 0.99800
#> CabinB42 17.808535 6522.638598 0.003 0.99782
#> CabinB49 16.788374 6522.638625 0.003 0.99795
#> CabinB5 16.173688 4593.637659 0.004 0.99719
#> CabinB50 19.975322 6522.638605 0.003 0.99756
#> CabinB51 B53 B55 -0.479194 3.012873 -0.159 0.87363
#> CabinB58 B60 -1.642364 1.871106 -0.878 0.38008
#> CabinB73 17.512366 6522.638607 0.003 0.99786
#> CabinB77 17.663674 6522.638605 0.003 0.99784
#> CabinB78 16.560635 6522.638630 0.003 0.99797
#> CabinB80 17.357805 6522.638636 0.003 0.99788
#> CabinB82 B84 -17.227237 6522.638618 -0.003 0.99789
#> CabinB94 -16.203145 6522.638607 -0.002 0.99802
#> CabinB96 B98 18.467029 2774.557465 0.007 0.99469
#> CabinC101 18.561249 6522.638619 0.003 0.99773
#> CabinC103 18.939999 6522.638605 0.003 0.99768
#> CabinC104 20.975927 6522.638615 0.003 0.99743
#> CabinC106 20.347763 6522.638598 0.003 0.99751
#> CabinC123 -16.794409 6522.638608 -0.003 0.99795
#> CabinC124 -16.746677 6522.638605 -0.003 0.99795
#> CabinC125 17.728808 6522.638633 0.003 0.99783
#> CabinC126 17.899947 6522.638599 0.003 0.99781
#> CabinC128 -16.827320 6522.638605 -0.003 0.99794
#> CabinC148 19.811088 6522.638602 0.003 0.99758
#> CabinC22 C26 -2.295112 1.476319 -1.555 0.12004
#> CabinC23 C25 C27 -1.537441 1.704236 -0.902 0.36699
#> CabinC45 16.021387 6522.638675 0.002 0.99804
#> CabinC47 19.918147 6522.638604 0.003 0.99756
#> CabinC52 20.337599 6522.638600 0.003 0.99751
#> CabinC65 -0.868704 1.722350 -0.504 0.61400
#> CabinC68 -17.445104 6522.638626 -0.003 0.99787
#> CabinC7 16.859445 6522.638635 0.003 0.99794
#> CabinC70 18.785668 6522.638632 0.003 0.99770
#> CabinC78 -0.246617 1.787523 -0.138 0.89027
#> CabinC82 -19.025029 6522.638672 -0.003 0.99767
#> CabinC83 17.748878 6522.638605 0.003 0.99783
#> CabinC85 17.512482 6522.638607 0.003 0.99786
#> CabinC86 -17.374388 6522.638625 -0.003 0.99787
#> CabinC90 17.325984 6522.638603 0.003 0.99788
#> CabinC91 -17.724052 6522.638637 -0.003 0.99783
#> CabinC92 18.855636 3764.985305 0.005 0.99600
#> CabinC95 -18.609850 6522.638677 -0.003 0.99772
#> CabinD 18.089031 4531.840532 0.004 0.99682
#> CabinD10 D12 19.408230 6522.638617 0.003 0.99763
#> CabinD11 18.252210 6522.638622 0.003 0.99777
#> CabinD15 17.295678 6522.638622 0.003 0.99788
#> CabinD17 18.678389 4612.106401 0.004 0.99677
#> CabinD19 20.483775 6522.638613 0.003 0.99749
#> CabinD21 17.901259 6522.638599 0.003 0.99781
#> CabinD26 -17.072076 4513.176240 -0.004 0.99698
#> CabinD28 17.634323 6522.638599 0.003 0.99784
#> CabinD30 -17.301427 6522.638608 -0.003 0.99788
#> CabinD35 20.342937 6522.638600 0.003 0.99751
#> CabinD36 16.689218 6522.638618 0.003 0.99796
#> CabinD37 18.094314 6522.638615 0.003 0.99779
#> CabinD46 -16.330650 6522.638608 -0.003 0.99800
#> CabinD47 17.844005 6522.638601 0.003 0.99782
#> CabinD48 -17.214420 6522.638630 -0.003 0.99789
#> CabinD56 20.635920 6522.638610 0.003 0.99748
#> CabinD6 -16.799295 6522.638605 -0.003 0.99795
#> CabinD7 18.590222 6522.638627 0.003 0.99773
#> CabinE101 18.286738 4610.094498 0.004 0.99684
#> CabinE12 20.900953 6522.638613 0.003 0.99744
#> CabinE121 19.852236 6522.638601 0.003 0.99757
#> CabinE17 20.985456 6522.638617 0.003 0.99743
#> CabinE24 20.638689 4607.528613 0.004 0.99643
#> CabinE25 20.564969 4612.201890 0.004 0.99644
#> CabinE31 -16.617964 6522.638610 -0.003 0.99797
#> CabinE33 17.766299 4606.551251 0.004 0.99692
#> CabinE34 16.965508 6522.638623 0.003 0.99792
#> CabinE36 17.424129 6522.638617 0.003 0.99787
#> CabinE40 16.993675 6522.638639 0.003 0.99792
#> CabinE44 18.124711 6522.638616 0.003 0.99778
#> CabinE58 -16.250166 6522.638608 -0.002 0.99801
#> CabinE67 0.636389 1.762514 0.361 0.71805
#> CabinE68 17.306532 6522.638607 0.003 0.99788
#> CabinE77 -18.067132 6522.638613 -0.003 0.99779
#> CabinF E69 17.745582 6522.638603 0.003 0.99783
#> CabinF G63 -16.219818 6522.638607 -0.002 0.99802
#> CabinF G73 -16.867674 6522.638605 -0.003 0.99794
#> CabinF2 1.544397 1.466622 1.053 0.29233
#> CabinF33 18.195954 4608.987285 0.004 0.99685
#> CabinF38 -17.154317 6522.638613 -0.003 0.99790
#> CabinF4 18.572375 3965.646422 0.005 0.99626
#> CabinG6 -0.720965 1.039707 -0.693 0.48804
#> EmbarkedC 0.437251 0.312132 1.401 0.16126
#> EmbarkedQ 0.587058 0.369221 1.590 0.11184
#> EmbarkedS NA NA NA NA
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 834.17 on 622 degrees of freedom
#> Residual deviance: 490.73 on 513 degrees of freedom
#> AIC: 710.73
#>
#> Number of Fisher Scoring iterations: 17
In this first model, many of the predictor variables are not significant with respect to the target, so we will refit the model using stepwise selection.
library(MASS)
model2 <- stepAIC(model, direction = "backward")
#> Start: AIC=710.73
#> Survived ~ Age + Sex + Fare + Cabin + Embarked
#>
#> Df Deviance AIC
#> - Cabin 103 637.17 651.17
#> - Embarked 2 494.61 710.61
#> <none> 490.73 710.73
#> - Fare 1 496.19 714.19
#> - Age 1 498.75 716.75
#> - Sex 1 594.49 812.49
#>
#> Step: AIC=651.17
#> Survived ~ Age + Sex + Fare + Embarked
#>
#> Df Deviance AIC
#> - Embarked 3 641.95 649.95
#> <none> 637.17 651.17
#> - Age 1 639.90 651.90
#> - Fare 1 652.54 664.54
#> - Sex 1 778.05 790.05
#>
#> Step: AIC=649.95
#> Survived ~ Age + Sex + Fare
#>
#> Df Deviance AIC
#> <none> 641.95 649.95
#> - Age 1 644.57 650.57
#> - Fare 1 661.57 667.57
#> - Sex 1 789.18 795.18
Using the backward direction of the stepwise procedure, we obtain the following model.
summary(model2)
#>
#> Call:
#> glm(formula = Survived ~ Age + Sex + Fare, family = "binomial",
#> data = titanic_train)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -2.1999 -0.6555 -0.6073 0.8285 2.0577
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) 1.026064 0.271342 3.781 0.000156 ***
#> Age -0.012556 0.007816 -1.606 0.108193
#> Sexmale -2.272862 0.199458 -11.395 < 0.0000000000000002 ***
#> Fare 0.008747 0.002291 3.817 0.000135 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 834.17 on 622 degrees of freedom
#> Residual deviance: 641.95 on 619 degrees of freedom
#> AIC: 649.95
#>
#> Number of Fisher Scoring iterations: 4
Using model2, the result of the stepwise selection, we now predict on the test data we set aside earlier.
titanic_test$prob_surv <- predict(model2, type = "response", newdata = titanic_test)
Let's look at the distribution of the predicted probabilities.
library(ggplot2)
ggplot(titanic_test, aes(x=prob_surv)) +
geom_density(lwd=0.5) +
labs(title = "Distribution of Probability Prediction Data") +
  theme_minimal()
From the plot above, we can see that the predicted probabilities lean toward 0.
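As a quick check of that reading (an added sketch, not part of the original workflow), we can compute the share of predicted probabilities that fall below the 0.5 cutoff:
# Proportion of test passengers with predicted survival probability below 0.5
mean(titanic_test$prob_surv < 0.5)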
titanic_test$pred_surv <- factor(ifelse(titanic_test$prob_surv > 0.5, yes = 1, no = 0))
titanic_test[1:10, c("pred_surv", "Survived")]
#> pred_surv Survived
#> 3 1 1
#> 4 1 1
#> 5 0 0
#> 6 0 0
#> 7 0 0
#> 8 0 0
#> 16 1 1
#> 17 0 0
#> 19 1 0
#> 21 0 0
conf <- confusionMatrix(titanic_test$pred_surv, titanic_test$Survived, positive = "1")
conf
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 149 28
#> 1 21 70
#>
#> Accuracy : 0.8172
#> 95% CI : (0.7656, 0.8616)
#> No Information Rate : 0.6343
#> P-Value [Acc > NIR] : 0.00000000004661
#>
#> Kappa : 0.5998
#>
#> Mcnemar's Test P-Value : 0.3914
#>
#> Sensitivity : 0.7143
#> Specificity : 0.8765
#> Pos Pred Value : 0.7692
#> Neg Pred Value : 0.8418
#> Prevalence : 0.3657
#> Detection Rate : 0.2612
#> Detection Prevalence : 0.3396
#> Balanced Accuracy : 0.7954
#>
#> 'Positive' Class : 1
#>
Based on the confusionMatrix output above, the model classifies the target (Survived vs. Not Survived) correctly 81.7% of the time (accuracy). Of all passengers who actually did not survive, the model predicts 87.6% correctly (specificity). Of all passengers who actually survived, the model predicts 71.4% correctly (sensitivity/recall). Of all passengers the model predicts as survivors, 76.9% truly survived (precision).
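As a cross-check (an added sketch, not in the original write-up), the same four numbers can be recovered by hand from the confusion-matrix table stored in conf:
# conf$table has Prediction in the rows and Reference in the columns
tab <- conf$table
TP <- tab["1", "1"]; TN <- tab["0", "0"]
FP <- tab["1", "0"]; FN <- tab["0", "1"]
(TP + TN) / sum(tab)   # accuracy     ~ 0.817
TN / (TN + FP)         # specificity  ~ 0.876
TP / (TP + FN)         # sensitivity  ~ 0.714
TP / (TP + FP)         # precision    ~ 0.769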
titanic_knn <- read.csv("train.csv")
titanic_knn <- titanic_knn[, -1]
titanic_knn$Survived <- as.factor(titanic_knn$Survived)
# Impute missing Age values with the mean, as before
titanic_knn$Age[is.na(titanic_knn$Age)] <- mean(titanic_knn$Age, na.rm = TRUE)
head(titanic_knn)
#> Survived Pclass Name Sex
#> 1 0 3 Braund, Mr. Owen Harris male
#> 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
#> 3 1 3 Heikkinen, Miss. Laina female
#> 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female
#> 5 0 3 Allen, Mr. William Henry male
#> 6 0 3 Moran, Mr. James male
#> Age SibSp Parch Ticket Fare Cabin Embarked
#> 1 22.00000 1 0 A/5 21171 7.2500 S
#> 2 38.00000 1 0 PC 17599 71.2833 C85 C
#> 3 26.00000 0 0 STON/O2. 3101282 7.9250 S
#> 4 35.00000 1 0 113803 53.1000 C123 S
#> 5 35.00000 0 0 373450 8.0500 S
#> 6 29.69912 0 0 330877 8.4583 Q
RNGkind(sample.kind = "Rounding")
set.seed(100)
index <- sample(nrow(titanic_knn), nrow(titanic_knn)*0.7)
knn_train <- titanic_knn[index,]
knn_test <- titanic_knn[-index,]

prop.table(table(knn_train$Survived))
#>
#> 0 1
#> 0.6083467 0.3916533
For k-NN, we separate the predictors from the label (the target variable).
library(dplyr)
knn_train_real <- knn_train %>%
dplyr::select(c(Survived,Pclass,Age,SibSp,Parch,Fare))
knn_test_real <- knn_test %>%
  dplyr::select(c(Survived,Pclass,Age,SibSp,Parch,Fare))

train_x <- knn_train_real[,-1]
test_x <- knn_test_real[,-1]
train_y <- knn_train_real$Survived
test_y <- knn_test_real$Survived

The predictors are scaled using z-score standardization. The test data must also be scaled using the parameters (means and standard deviations) of the training data, because the test set is treated as unseen data.
train_xs <- scale(train_x)
test_xs <- scale(test_x,
center = attr(train_xs,"scaled:center") ,
                 scale = attr(train_xs,"scaled:scale"))
Finding the optimal value of K.
sqrt(nrow(train_xs))
#> [1] 24.95997
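As an added sketch (not part of the original workflow), instead of relying only on the sqrt(n) rule of thumb we could scan a few odd values of k and compare accuracy; strictly this should be done on a separate validation split or with cross-validation rather than on the test set.
# Try several odd k values and record test-set accuracy for each
k_grid <- seq(3, 31, by = 2)
acc <- sapply(k_grid, function(k) {
  pred <- knn(train = train_xs, test = test_xs, cl = train_y, k = k)
  mean(pred == test_y)
})
data.frame(k = k_grid, accuracy = acc)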
The rule of thumb suggests k of about 25; here an odd value, k = 23, is used, since an odd k avoids ties between the two classes. k-NN does not build an explicit model, so we go straight to prediction.
pred_knn <- knn(train = train_xs,
test = test_xs,
cl = train_y,
                k = 23)
Check the prediction results.
pred_knn
#> [1] 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0
#> [38] 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 1 0 1 0 0 0 1 0 0 1
#> [75] 0 0 0 0 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 1 0 1 1 1 0 0 0
#> [112] 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 1 1 1 0 1 0 0 0 1 1
#> [149] 1 1 1 0 1 0 1 0 0 1 1 1 0 1 1 1 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 1
#> [186] 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 0 1 0 0 1 0 0 0 0 0 1 0 1 0 1 1 1 0 0 0
#> [223] 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0 0
#> [260] 0 0 0 1 0 0 0 0 0
#> Levels: 0 1
Model Evaluation
confusionMatrix(data = pred_knn, reference = test_y, positive = "1")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 136 42
#> 1 34 56
#>
#> Accuracy : 0.7164
#> 95% CI : (0.6584, 0.7696)
#> No Information Rate : 0.6343
#> P-Value [Acc > NIR] : 0.002792
#>
#> Kappa : 0.378
#>
#> Mcnemar's Test P-Value : 0.422001
#>
#> Sensitivity : 0.5714
#> Specificity : 0.8000
#> Pos Pred Value : 0.6222
#> Neg Pred Value : 0.7640
#> Prevalence : 0.3657
#> Detection Rate : 0.2090
#> Detection Prevalence : 0.3358
#> Balanced Accuracy : 0.6857
#>
#> 'Positive' Class : 1
#>
Based on the confusionMatrix output above, the k-NN model classifies the target (Survived vs. Not Survived) correctly 71.6% of the time (accuracy). Of all passengers who actually did not survive, the model predicts 80% correctly (specificity). Of all passengers who actually survived, the model predicts 57.1% correctly (sensitivity/recall). Of all passengers the model predicts as survivors, 62.2% truly survived (precision).
I will pay the most attention to the precision metric, because I do not want the model to be wrong when it predicts that a passenger survived.
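To close, a small added sketch (not in the original code) putting the two models side by side on precision; conf is the logistic-regression result from above, and conf_knn recomputes the k-NN confusion matrix.
# Precision (Pos Pred Value) of both models on the same test split
conf_knn <- confusionMatrix(data = pred_knn, reference = test_y, positive = "1")
data.frame(model     = c("logistic regression", "k-NN (k = 23)"),
           precision = c(conf$byClass["Pos Pred Value"],
                         conf_knn$byClass["Pos Pred Value"]))
On that basis, the logistic regression model (precision of about 0.77) would be preferred over k-NN (about 0.62).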