1 Data Preparation

1.1 Load the required package

library(dplyr)
library(gtools)
library(caret)
library(class)

1.2 Load Dataset

titanic <- read.csv("train.csv")
str(titanic)
#> 'data.frame':    891 obs. of  12 variables:
#>  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
#>  $ Survived   : int  0 1 1 1 0 0 0 0 1 1 ...
#>  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
#>  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
#>  $ Sex        : chr  "male" "female" "female" "female" ...
#>  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
#>  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
#>  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
#>  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
#>  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
#>  $ Cabin      : chr  "" "C85" "" "C123" ...
#>  $ Embarked   : chr  "S" "C" "S" "S" ...

Keterangan Data

  • PassangerId = Nomor Identitas Penumpang
  • Survived = Survival 0 = No, 1 = Yes
  • Pclass = Tipe Kelas Tiket 1 = 1st, 2 = 2nd, 3 = 3rd
  • Name = Nama Lengkap Penumpang
  • Sex = Jenis Kelamin
  • Age = Usia Dalam Tahun
  • SibSp = Jumlah Saudara Yang Naik Ke Titanic
  • Parch = Jumlah Orang Tua / Anak Yang Naik Ke Titanic
  • Ticket = Nomor Tiket
  • Fare = Tarif Penumpang
  • Cabin = Nomor Kabin
  • Embarked = Port of Embarkation C = Cherbourg, Q = Queenstown, S = Southampton
head(titanic)
#>   PassengerId Survived Pclass
#> 1           1        0      3
#> 2           2        1      1
#> 3           3        1      3
#> 4           4        1      1
#> 5           5        0      3
#> 6           6        0      3
#>                                                  Name    Sex Age SibSp Parch
#> 1                             Braund, Mr. Owen Harris   male  22     1     0
#> 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
#> 3                              Heikkinen, Miss. Laina female  26     0     0
#> 4        Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
#> 5                            Allen, Mr. William Henry   male  35     0     0
#> 6                                    Moran, Mr. James   male  NA     0     0
#>             Ticket    Fare Cabin Embarked
#> 1        A/5 21171  7.2500              S
#> 2         PC 17599 71.2833   C85        C
#> 3 STON/O2. 3101282  7.9250              S
#> 4           113803 53.1000  C123        S
#> 5           373450  8.0500              S
#> 6           330877  8.4583              Q

1.3 Data Wrangling

titanic <- titanic %>% 
  mutate(Survived = as.factor(Survived),
         Sex = as.factor(Sex),
         Pclass = as.factor(Pclass),
         Embarked = as.factor(Embarked),
         SibSp = as.factor(SibSp),
         Parch = as.factor(Parch))
glimpse(titanic)
#> Rows: 891
#> Columns: 12
#> $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
#> $ Survived    <fct> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1…
#> $ Pclass      <fct> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3…
#> $ Name        <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl…
#> $ Sex         <fct> male, female, female, female, male, male, male, male, fema…
#> $ Age         <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, …
#> $ SibSp       <fct> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0…
#> $ Parch       <fct> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0…
#> $ Ticket      <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37…
#> $ Fare        <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,…
#> $ Cabin       <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6", "C…
#> $ Embarked    <fct> S, C, S, S, S, Q, S, S, S, C, S, S, S, S, S, S, Q, S, S, C…

Selanjutnya yaitu melakukan pengecekan terhadap missing value. Missing value perlu kita cek terlebih dahulu

colSums(is.na(titanic))
#> PassengerId    Survived      Pclass        Name         Sex         Age 
#>           0           0           0           0           0         177 
#>       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
#>           0           0           0           0           0           0
titanic$Age[is.na(titanic$Age)] <- mean(titanic$Age, na.rm = T)
colSums(is.na(titanic))
#> PassengerId    Survived      Pclass        Name         Sex         Age 
#>           0           0           0           0           0           0 
#>       SibSp       Parch      Ticket        Fare       Cabin    Embarked 
#>           0           0           0           0           0           0
prop.table(table(titanic$Survived))
#> 
#>         0         1 
#> 0.6161616 0.3838384
table(titanic$Survived)
#> 
#>   0   1 
#> 549 342

Jika dilihat dari proporsi kedua kelas, sudah cukup seimbang, sehingga kita tidak terlalu membutuhkan pre-processing tambahan untuk menyeimbangkan proporsi antar dua kelas target variabel.

1.4 Splitting Data Train-Test

RNGkind(sample.kind = "Rounding")
set.seed(100) 

index <- sample(nrow(titanic), nrow(titanic)*0.7)

titanic_train <- titanic[index,]
titanic_test <- titanic[-index,]

2 Modelling

model <- glm(formula = Survived~Age+Sex+Fare+Cabin+Embarked, family = "binomial", data = titanic_train)

summary(model)
#> 
#> Call:
#> glm(formula = Survived ~ Age + Sex + Fare + Cabin + Embarked, 
#>     family = "binomial", data = titanic_train)
#> 
#> Deviance Residuals: 
#>     Min       1Q   Median       3Q      Max  
#> -1.7715  -0.5810  -0.4457   0.6425   2.4236  
#> 
#> Coefficients: (1 not defined because of singularities)
#>                     Estimate  Std. Error z value             Pr(>|z|)    
#> (Intercept)         1.006414    0.344985   2.917              0.00353 ** 
#> Age                -0.028168    0.010190  -2.764              0.00571 ** 
#> Sexmale            -2.242630    0.231418  -9.691 < 0.0000000000000002 ***
#> Fare                0.009544    0.004766   2.003              0.04522 *  
#> CabinA10          -17.136000 6522.638610  -0.003              0.99790    
#> CabinA16           18.096529 6522.638607   0.003              0.99779    
#> CabinA19          -16.741428 6522.638605  -0.003              0.99795    
#> CabinA20           20.201945 6522.638623   0.003              0.99753    
#> CabinA23           21.769393 6522.638619   0.003              0.99734    
#> CabinA24          -16.938562 6522.638607  -0.003              0.99793    
#> CabinA31           20.195892 6522.638604   0.003              0.99753    
#> CabinA32          -16.970473 6522.638607  -0.003              0.99792    
#> CabinA34           19.133740 6522.638610   0.003              0.99766    
#> CabinA6            20.252185 6522.638613   0.003              0.99752    
#> CabinB101          15.461481 6522.639034   0.002              0.99811    
#> CabinB102         -16.493297 6522.638605  -0.003              0.99798    
#> CabinB19          -15.931332 6522.638614  -0.002              0.99805    
#> CabinB20           19.126186 3811.622270   0.005              0.99600    
#> CabinB22           17.896101 6522.638617   0.003              0.99781    
#> CabinB28           18.542569 6522.638612   0.003              0.99773    
#> CabinB3            16.753962 6522.638661   0.003              0.99795    
#> CabinB30          -16.527704 6522.638623  -0.003              0.99798    
#> CabinB35           17.137062 6522.638606   0.003              0.99790    
#> CabinB38          -16.315687 6522.638607  -0.003              0.99800    
#> CabinB42           17.808535 6522.638598   0.003              0.99782    
#> CabinB49           16.788374 6522.638625   0.003              0.99795    
#> CabinB5            16.173688 4593.637659   0.004              0.99719    
#> CabinB50           19.975322 6522.638605   0.003              0.99756    
#> CabinB51 B53 B55   -0.479194    3.012873  -0.159              0.87363    
#> CabinB58 B60       -1.642364    1.871106  -0.878              0.38008    
#> CabinB73           17.512366 6522.638607   0.003              0.99786    
#> CabinB77           17.663674 6522.638605   0.003              0.99784    
#> CabinB78           16.560635 6522.638630   0.003              0.99797    
#> CabinB80           17.357805 6522.638636   0.003              0.99788    
#> CabinB82 B84      -17.227237 6522.638618  -0.003              0.99789    
#> CabinB94          -16.203145 6522.638607  -0.002              0.99802    
#> CabinB96 B98       18.467029 2774.557465   0.007              0.99469    
#> CabinC101          18.561249 6522.638619   0.003              0.99773    
#> CabinC103          18.939999 6522.638605   0.003              0.99768    
#> CabinC104          20.975927 6522.638615   0.003              0.99743    
#> CabinC106          20.347763 6522.638598   0.003              0.99751    
#> CabinC123         -16.794409 6522.638608  -0.003              0.99795    
#> CabinC124         -16.746677 6522.638605  -0.003              0.99795    
#> CabinC125          17.728808 6522.638633   0.003              0.99783    
#> CabinC126          17.899947 6522.638599   0.003              0.99781    
#> CabinC128         -16.827320 6522.638605  -0.003              0.99794    
#> CabinC148          19.811088 6522.638602   0.003              0.99758    
#> CabinC22 C26       -2.295112    1.476319  -1.555              0.12004    
#> CabinC23 C25 C27   -1.537441    1.704236  -0.902              0.36699    
#> CabinC45           16.021387 6522.638675   0.002              0.99804    
#> CabinC47           19.918147 6522.638604   0.003              0.99756    
#> CabinC52           20.337599 6522.638600   0.003              0.99751    
#> CabinC65           -0.868704    1.722350  -0.504              0.61400    
#> CabinC68          -17.445104 6522.638626  -0.003              0.99787    
#> CabinC7            16.859445 6522.638635   0.003              0.99794    
#> CabinC70           18.785668 6522.638632   0.003              0.99770    
#> CabinC78           -0.246617    1.787523  -0.138              0.89027    
#> CabinC82          -19.025029 6522.638672  -0.003              0.99767    
#> CabinC83           17.748878 6522.638605   0.003              0.99783    
#> CabinC85           17.512482 6522.638607   0.003              0.99786    
#> CabinC86          -17.374388 6522.638625  -0.003              0.99787    
#> CabinC90           17.325984 6522.638603   0.003              0.99788    
#> CabinC91          -17.724052 6522.638637  -0.003              0.99783    
#> CabinC92           18.855636 3764.985305   0.005              0.99600    
#> CabinC95          -18.609850 6522.638677  -0.003              0.99772    
#> CabinD             18.089031 4531.840532   0.004              0.99682    
#> CabinD10 D12       19.408230 6522.638617   0.003              0.99763    
#> CabinD11           18.252210 6522.638622   0.003              0.99777    
#> CabinD15           17.295678 6522.638622   0.003              0.99788    
#> CabinD17           18.678389 4612.106401   0.004              0.99677    
#> CabinD19           20.483775 6522.638613   0.003              0.99749    
#> CabinD21           17.901259 6522.638599   0.003              0.99781    
#> CabinD26          -17.072076 4513.176240  -0.004              0.99698    
#> CabinD28           17.634323 6522.638599   0.003              0.99784    
#> CabinD30          -17.301427 6522.638608  -0.003              0.99788    
#> CabinD35           20.342937 6522.638600   0.003              0.99751    
#> CabinD36           16.689218 6522.638618   0.003              0.99796    
#> CabinD37           18.094314 6522.638615   0.003              0.99779    
#> CabinD46          -16.330650 6522.638608  -0.003              0.99800    
#> CabinD47           17.844005 6522.638601   0.003              0.99782    
#> CabinD48          -17.214420 6522.638630  -0.003              0.99789    
#> CabinD56           20.635920 6522.638610   0.003              0.99748    
#> CabinD6           -16.799295 6522.638605  -0.003              0.99795    
#> CabinD7            18.590222 6522.638627   0.003              0.99773    
#> CabinE101          18.286738 4610.094498   0.004              0.99684    
#> CabinE12           20.900953 6522.638613   0.003              0.99744    
#> CabinE121          19.852236 6522.638601   0.003              0.99757    
#> CabinE17           20.985456 6522.638617   0.003              0.99743    
#> CabinE24           20.638689 4607.528613   0.004              0.99643    
#> CabinE25           20.564969 4612.201890   0.004              0.99644    
#> CabinE31          -16.617964 6522.638610  -0.003              0.99797    
#> CabinE33           17.766299 4606.551251   0.004              0.99692    
#> CabinE34           16.965508 6522.638623   0.003              0.99792    
#> CabinE36           17.424129 6522.638617   0.003              0.99787    
#> CabinE40           16.993675 6522.638639   0.003              0.99792    
#> CabinE44           18.124711 6522.638616   0.003              0.99778    
#> CabinE58          -16.250166 6522.638608  -0.002              0.99801    
#> CabinE67            0.636389    1.762514   0.361              0.71805    
#> CabinE68           17.306532 6522.638607   0.003              0.99788    
#> CabinE77          -18.067132 6522.638613  -0.003              0.99779    
#> CabinF E69         17.745582 6522.638603   0.003              0.99783    
#> CabinF G63        -16.219818 6522.638607  -0.002              0.99802    
#> CabinF G73        -16.867674 6522.638605  -0.003              0.99794    
#> CabinF2             1.544397    1.466622   1.053              0.29233    
#> CabinF33           18.195954 4608.987285   0.004              0.99685    
#> CabinF38          -17.154317 6522.638613  -0.003              0.99790    
#> CabinF4            18.572375 3965.646422   0.005              0.99626    
#> CabinG6            -0.720965    1.039707  -0.693              0.48804    
#> EmbarkedC           0.437251    0.312132   1.401              0.16126    
#> EmbarkedQ           0.587058    0.369221   1.590              0.11184    
#> EmbarkedS                 NA          NA      NA                   NA    
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 834.17  on 622  degrees of freedom
#> Residual deviance: 490.73  on 513  degrees of freedom
#> AIC: 710.73
#> 
#> Number of Fisher Scoring iterations: 17

Pada pemodelan yang pertama, masih banyak variabel prediktor yang tidak signifikan terhadap target variabel, oleh karena itu kita akan coba melakukan model fitting menggunakan metode stepwise.

library(MASS)
model2 <- stepAIC(model, direction = "backward")
#> Start:  AIC=710.73
#> Survived ~ Age + Sex + Fare + Cabin + Embarked
#> 
#>             Df Deviance    AIC
#> - Cabin    103   637.17 651.17
#> - Embarked   2   494.61 710.61
#> <none>           490.73 710.73
#> - Fare       1   496.19 714.19
#> - Age        1   498.75 716.75
#> - Sex        1   594.49 812.49
#> 
#> Step:  AIC=651.17
#> Survived ~ Age + Sex + Fare + Embarked
#> 
#>            Df Deviance    AIC
#> - Embarked  3   641.95 649.95
#> <none>          637.17 651.17
#> - Age       1   639.90 651.90
#> - Fare      1   652.54 664.54
#> - Sex       1   778.05 790.05
#> 
#> Step:  AIC=649.95
#> Survived ~ Age + Sex + Fare
#> 
#>        Df Deviance    AIC
#> <none>      641.95 649.95
#> - Age   1   644.57 650.57
#> - Fare  1   661.57 667.57
#> - Sex   1   789.18 795.18

Dengan menggunakan metode backward pada stepwise, kita memperoleh model sebagai berikut.

summary(model2)
#> 
#> Call:
#> glm(formula = Survived ~ Age + Sex + Fare, family = "binomial", 
#>     data = titanic_train)
#> 
#> Deviance Residuals: 
#>     Min       1Q   Median       3Q      Max  
#> -2.1999  -0.6555  -0.6073   0.8285   2.0577  
#> 
#> Coefficients:
#>              Estimate Std. Error z value             Pr(>|z|)    
#> (Intercept)  1.026064   0.271342   3.781             0.000156 ***
#> Age         -0.012556   0.007816  -1.606             0.108193    
#> Sexmale     -2.272862   0.199458 -11.395 < 0.0000000000000002 ***
#> Fare         0.008747   0.002291   3.817             0.000135 ***
#> ---
#> Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#> 
#> (Dispersion parameter for binomial family taken to be 1)
#> 
#>     Null deviance: 834.17  on 622  degrees of freedom
#> Residual deviance: 641.95  on 619  degrees of freedom
#> AIC: 649.95
#> 
#> Number of Fisher Scoring iterations: 4

3 Prediksi

Dengan menggunakan model2 hasil dari stepwise, kita akan coba prediksi menggunakan data test yang sudah kita miliki.

titanic_test$prob_surv <- predict(model2, type = "response", newdata = titanic_test)

Melihat sebaran peluang prediksi data.

library(ggplot2)

ggplot(titanic_test, aes(x=prob_surv)) +
  geom_density(lwd=0.5) +
  labs(title = "Distribution of Probability Prediction Data") +
  theme_minimal()

Pada grafik diatas, dapat diinterpretasikan bahwa hasil prediksi yang dilakukan lebih condong ke 0

titanic_test$pred_surv <- factor(ifelse(titanic_test$prob_surv > 0.5, yes = 1, no = 0))
titanic_test[1:10, c("pred_surv", "Survived")]
#>    pred_surv Survived
#> 3          1        1
#> 4          1        1
#> 5          0        0
#> 6          0        0
#> 7          0        0
#> 8          0        0
#> 16         1        1
#> 17         0        0
#> 19         1        0
#> 21         0        0

4 Model Evaluation

conf <- confusionMatrix(titanic_test$pred_surv, titanic_test$Survived, positive = "1")
conf
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction   0   1
#>          0 149  28
#>          1  21  70
#>                                           
#>                Accuracy : 0.8172          
#>                  95% CI : (0.7656, 0.8616)
#>     No Information Rate : 0.6343          
#>     P-Value [Acc > NIR] : 0.00000000004661
#>                                           
#>                   Kappa : 0.5998          
#>                                           
#>  Mcnemar's Test P-Value : 0.3914          
#>                                           
#>             Sensitivity : 0.7143          
#>             Specificity : 0.8765          
#>          Pos Pred Value : 0.7692          
#>          Neg Pred Value : 0.8418          
#>              Prevalence : 0.3657          
#>          Detection Rate : 0.2612          
#>    Detection Prevalence : 0.3396          
#>       Balanced Accuracy : 0.7954          
#>                                           
#>        'Positive' Class : 1               
#> 
  • Re-call/Sensitivity = dari semua data aktual yang positif, seberapa mampu proporsi model saya menebak benar.
  • Specificity = dari semua data aktual yang negatif, seberapa mampu proporsi model saya menebak yang benar.
  • Accuracy = seberapa mampu model saya menebak dengan benar target Y.
  • Precision = dari semua hasil prediksi, seberapa mampu model saya dapat menebak benar kelas positif.

Berdasarkan hasil confusionMatrix diatas, dapat kita ambil informasi bahwa kemampuan model dalam menebak target Y ( Survived dan Not Not Survived) sebesar 81,7%. Sedangkan dari keluruhan data aktual orang yang Survived, model dapat mampu menebak benar sebesar 87,6%. Dari keseluruhan data aktual orang yang Survived, model mampu menebak dengan benar sebesar 71,4%. Dari keseluruhan hasil prediksi yang mampu ditebak oleh model, model mampu menebak benar kelas positif sebesar 76,9%.

5 K-Nearest Neighbour

5.1 Pre-Processing Data

titanic_knn <- read.csv("train.csv")
titanic_knn <- titanic_knn[, -1]
titanic_knn$Survived <- as.factor(titanic_knn$Survived)
titanic_knn$Age[is.na(titanic_knn$Age)] <- mean(titanic_knn$Age, na.rm = T)
head(titanic_knn)
#>   Survived Pclass                                                Name    Sex
#> 1        0      3                             Braund, Mr. Owen Harris   male
#> 2        1      1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
#> 3        1      3                              Heikkinen, Miss. Laina female
#> 4        1      1        Futrelle, Mrs. Jacques Heath (Lily May Peel) female
#> 5        0      3                            Allen, Mr. William Henry   male
#> 6        0      3                                    Moran, Mr. James   male
#>        Age SibSp Parch           Ticket    Fare Cabin Embarked
#> 1 22.00000     1     0        A/5 21171  7.2500              S
#> 2 38.00000     1     0         PC 17599 71.2833   C85        C
#> 3 26.00000     0     0 STON/O2. 3101282  7.9250              S
#> 4 35.00000     1     0           113803 53.1000  C123        S
#> 5 35.00000     0     0           373450  8.0500              S
#> 6 29.69912     0     0           330877  8.4583              Q
RNGkind(sample.kind = "Rounding")
set.seed(100)

index <- sample(nrow(titanic_knn), nrow(titanic_knn)*0.7)

knn_train <- titanic_knn[index,]
knn_test <- titanic_knn[-index,]
prop.table(table(knn_train$Survived))
#> 
#>         0         1 
#> 0.6083467 0.3916533

Untuk k-NN, dipisahkan antara prediktor dan label (target variabelnya).

library(dplyr)

knn_train_real <- knn_train %>% 
  dplyr::select(c(Survived,Pclass,Age,SibSp,Parch,Fare))

knn_test_real <- knn_test %>% 
  dplyr::select(c(Survived,Pclass,Age,SibSp,Parch,Fare))
train_x <- knn_train_real[,-1]

test_x <- knn_test_real[,-1]

train_y <- knn_train_real$Survived

test_y <- knn_test_real$Survived

Data prediktor akan discaling menggunakan z-score standarization. Data test juga harus discaling menggunakan parameter dari data train (karena menganggap data test adalah unseen data).

train_xs <- scale(train_x)
test_xs <- scale(test_x,
                      center = attr(train_xs,"scaled:center") ,
                      scale = attr(train_xs,"scaled:scale"))

Mencari nilai optimum K

sqrt(nrow(train_xs))
#> [1] 24.95997

k-NN tidak membuat model sehingga langsung ke predict.

pred_knn <- knn(train = train_xs,
                 test = test_xs,
                 cl = train_y,
                 k = 23)

Cek hasil prediksi

pred_knn
#>   [1] 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0
#>  [38] 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 1 0 1 0 0 0 1 0 0 1
#>  [75] 0 0 0 0 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 1 0 1 1 1 0 0 0
#> [112] 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 1 1 1 0 1 0 0 0 1 1
#> [149] 1 1 1 0 1 0 1 0 0 1 1 1 0 1 1 1 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 1
#> [186] 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 0 1 0 0 1 0 0 0 0 0 1 0 1 0 1 1 1 0 0 0
#> [223] 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0 0
#> [260] 0 0 0 1 0 0 0 0 0
#> Levels: 0 1

Evaluasi Model

confusionMatrix(data = pred_knn, reference = test_y, positive = "1")
#> Confusion Matrix and Statistics
#> 
#>           Reference
#> Prediction   0   1
#>          0 136  42
#>          1  34  56
#>                                           
#>                Accuracy : 0.7164          
#>                  95% CI : (0.6584, 0.7696)
#>     No Information Rate : 0.6343          
#>     P-Value [Acc > NIR] : 0.002792        
#>                                           
#>                   Kappa : 0.378           
#>                                           
#>  Mcnemar's Test P-Value : 0.422001        
#>                                           
#>             Sensitivity : 0.5714          
#>             Specificity : 0.8000          
#>          Pos Pred Value : 0.6222          
#>          Neg Pred Value : 0.7640          
#>              Prevalence : 0.3657          
#>          Detection Rate : 0.2090          
#>    Detection Prevalence : 0.3358          
#>       Balanced Accuracy : 0.6857          
#>                                           
#>        'Positive' Class : 1               
#> 

Berdasarkan hasil confusionMatrix diatas, dapat kita ambil informasi bahwa kemampuan model dalam menebak target Y ( Survived dan Not Not Survived) sebesar 71,6%. Sedangkan dari keluruhan data aktual orang yang Survived, model dapat mampu menebak benar sebesar 80%. Dari keseluruhan data aktual orang yang Survived, model mampu menebak dengan benar sebesar 57,1%. Dari keseluruhan hasil prediksi yang mampu ditebak oleh model, model mampu menebak benar kelas positif sebesar 62,2%.

6 Conclusion

Saya akan sangat melihat metric precision yang ada, dimana saya tidak ingin model yang saya buat salah dalam memprediksi penumpang yang selamat atau tidak.