library(dplyr)
library(gtools)
library(caret)
library(class)

titanic <- read.csv("train.csv")
str(titanic)
#> 'data.frame': 891 obs. of 12 variables:
#> $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
#> $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
#> $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
#> $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
#> $ Sex : chr "male" "female" "female" "female" ...
#> $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
#> $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
#> $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
#> $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
#> $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
#> $ Cabin : chr "" "C85" "" "C123" ...
#> $ Embarked : chr "S" "C" "S" "S" ...
Data Description
head(titanic)
#> PassengerId Survived Pclass
#> 1 1 0 3
#> 2 2 1 1
#> 3 3 1 3
#> 4 4 1 1
#> 5 5 0 3
#> 6 6 0 3
#> Name Sex Age SibSp Parch
#> 1 Braund, Mr. Owen Harris male 22 1 0
#> 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
#> 3 Heikkinen, Miss. Laina female 26 0 0
#> 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
#> 5 Allen, Mr. William Henry male 35 0 0
#> 6 Moran, Mr. James male NA 0 0
#> Ticket Fare Cabin Embarked
#> 1 A/5 21171 7.2500 S
#> 2 PC 17599 71.2833 C85 C
#> 3 STON/O2. 3101282 7.9250 S
#> 4 113803 53.1000 C123 S
#> 5 373450 8.0500 S
#> 6 330877 8.4583 Q
titanic <- titanic %>%
mutate(Survived = as.factor(Survived),
Sex = as.factor(Sex),
Pclass = as.factor(Pclass),
Embarked = as.factor(Embarked),
SibSp = as.factor(SibSp),
Parch = as.factor(Parch))
glimpse(titanic)
#> Rows: 891
#> Columns: 12
#> $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,…
#> $ Survived <fct> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 1…
#> $ Pclass <fct> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, 3…
#> $ Name <chr> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (Fl…
#> $ Sex <fct> male, female, female, female, male, male, male, male, fema…
#> $ Age <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14, …
#> $ SibSp <fct> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, 0…
#> $ Parch <fct> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0…
#> $ Ticket <chr> "A/5 21171", "PC 17599", "STON/O2. 3101282", "113803", "37…
#> $ Fare <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625,…
#> $ Cabin <chr> "", "C85", "", "C123", "", "", "E46", "", "", "", "G6", "C…
#> $ Embarked <fct> S, C, S, S, S, Q, S, S, S, C, S, S, S, S, S, S, Q, S, S, C…
Next, we check for missing values. They need to be inspected before we continue with modeling.
colSums(is.na(titanic))
#> PassengerId Survived Pclass Name Sex Age
#> 0 0 0 0 0 177
#> SibSp Parch Ticket Fare Cabin Embarked
#> 0 0 0 0 0 0
# Impute the 177 missing Age values with the mean age
titanic$Age[is.na(titanic$Age)] <- mean(titanic$Age, na.rm = TRUE)
colSums(is.na(titanic))
#> PassengerId Survived Pclass Name Sex Age
#> 0 0 0 0 0 0
#> SibSp Parch Ticket Fare Cabin Embarked
#> 0 0 0 0 0 0
prop.table(table(titanic$Survived))
#>
#> 0 1
#> 0.6161616 0.3838384
table(titanic$Survived)
#>
#> 0 1
#> 549 342
Looking at the proportions of the two classes, they are reasonably balanced, so we do not need additional pre-processing to balance the two levels of the target variable.
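Purely as an illustration (not used in the rest of this analysis): had the classes been strongly imbalanced, caret's downSample() could rebalance them by randomly dropping rows from the majority class. The object name titanic_down below is hypothetical.
# Hedged illustration only; not applied to the data used in this analysis
set.seed(100)
titanic_down <- downSample(x = dplyr::select(titanic, -Survived),
                           y = titanic$Survived,
                           yname = "Survived")
prop.table(table(titanic_down$Survived))   # would show a 0.5 / 0.5 split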
RNGkind(sample.kind = "Rounding")
set.seed(100)
index <- sample(nrow(titanic), nrow(titanic)*0.7)
titanic_train <- titanic[index,]
titanic_test <- titanic[-index,]

model <- glm(formula = Survived ~ Age + Sex + Fare + Cabin + Embarked, family = "binomial", data = titanic_train)
summary(model)
#>
#> Call:
#> glm(formula = Survived ~ Age + Sex + Fare + Cabin + Embarked,
#> family = "binomial", data = titanic_train)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -1.7715 -0.5810 -0.4457 0.6425 2.4236
#>
#> Coefficients: (1 not defined because of singularities)
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) 1.006414 0.344985 2.917 0.00353 **
#> Age -0.028168 0.010190 -2.764 0.00571 **
#> Sexmale -2.242630 0.231418 -9.691 < 0.0000000000000002 ***
#> Fare 0.009544 0.004766 2.003 0.04522 *
#> CabinA10 -17.136000 6522.638610 -0.003 0.99790
#> CabinA16 18.096529 6522.638607 0.003 0.99779
#> CabinA19 -16.741428 6522.638605 -0.003 0.99795
#> CabinA20 20.201945 6522.638623 0.003 0.99753
#> CabinA23 21.769393 6522.638619 0.003 0.99734
#> CabinA24 -16.938562 6522.638607 -0.003 0.99793
#> CabinA31 20.195892 6522.638604 0.003 0.99753
#> CabinA32 -16.970473 6522.638607 -0.003 0.99792
#> CabinA34 19.133740 6522.638610 0.003 0.99766
#> CabinA6 20.252185 6522.638613 0.003 0.99752
#> CabinB101 15.461481 6522.639034 0.002 0.99811
#> CabinB102 -16.493297 6522.638605 -0.003 0.99798
#> CabinB19 -15.931332 6522.638614 -0.002 0.99805
#> CabinB20 19.126186 3811.622270 0.005 0.99600
#> CabinB22 17.896101 6522.638617 0.003 0.99781
#> CabinB28 18.542569 6522.638612 0.003 0.99773
#> CabinB3 16.753962 6522.638661 0.003 0.99795
#> CabinB30 -16.527704 6522.638623 -0.003 0.99798
#> CabinB35 17.137062 6522.638606 0.003 0.99790
#> CabinB38 -16.315687 6522.638607 -0.003 0.99800
#> CabinB42 17.808535 6522.638598 0.003 0.99782
#> CabinB49 16.788374 6522.638625 0.003 0.99795
#> CabinB5 16.173688 4593.637659 0.004 0.99719
#> CabinB50 19.975322 6522.638605 0.003 0.99756
#> CabinB51 B53 B55 -0.479194 3.012873 -0.159 0.87363
#> CabinB58 B60 -1.642364 1.871106 -0.878 0.38008
#> CabinB73 17.512366 6522.638607 0.003 0.99786
#> CabinB77 17.663674 6522.638605 0.003 0.99784
#> CabinB78 16.560635 6522.638630 0.003 0.99797
#> CabinB80 17.357805 6522.638636 0.003 0.99788
#> CabinB82 B84 -17.227237 6522.638618 -0.003 0.99789
#> CabinB94 -16.203145 6522.638607 -0.002 0.99802
#> CabinB96 B98 18.467029 2774.557465 0.007 0.99469
#> CabinC101 18.561249 6522.638619 0.003 0.99773
#> CabinC103 18.939999 6522.638605 0.003 0.99768
#> CabinC104 20.975927 6522.638615 0.003 0.99743
#> CabinC106 20.347763 6522.638598 0.003 0.99751
#> CabinC123 -16.794409 6522.638608 -0.003 0.99795
#> CabinC124 -16.746677 6522.638605 -0.003 0.99795
#> CabinC125 17.728808 6522.638633 0.003 0.99783
#> CabinC126 17.899947 6522.638599 0.003 0.99781
#> CabinC128 -16.827320 6522.638605 -0.003 0.99794
#> CabinC148 19.811088 6522.638602 0.003 0.99758
#> CabinC22 C26 -2.295112 1.476319 -1.555 0.12004
#> CabinC23 C25 C27 -1.537441 1.704236 -0.902 0.36699
#> CabinC45 16.021387 6522.638675 0.002 0.99804
#> CabinC47 19.918147 6522.638604 0.003 0.99756
#> CabinC52 20.337599 6522.638600 0.003 0.99751
#> CabinC65 -0.868704 1.722350 -0.504 0.61400
#> CabinC68 -17.445104 6522.638626 -0.003 0.99787
#> CabinC7 16.859445 6522.638635 0.003 0.99794
#> CabinC70 18.785668 6522.638632 0.003 0.99770
#> CabinC78 -0.246617 1.787523 -0.138 0.89027
#> CabinC82 -19.025029 6522.638672 -0.003 0.99767
#> CabinC83 17.748878 6522.638605 0.003 0.99783
#> CabinC85 17.512482 6522.638607 0.003 0.99786
#> CabinC86 -17.374388 6522.638625 -0.003 0.99787
#> CabinC90 17.325984 6522.638603 0.003 0.99788
#> CabinC91 -17.724052 6522.638637 -0.003 0.99783
#> CabinC92 18.855636 3764.985305 0.005 0.99600
#> CabinC95 -18.609850 6522.638677 -0.003 0.99772
#> CabinD 18.089031 4531.840532 0.004 0.99682
#> CabinD10 D12 19.408230 6522.638617 0.003 0.99763
#> CabinD11 18.252210 6522.638622 0.003 0.99777
#> CabinD15 17.295678 6522.638622 0.003 0.99788
#> CabinD17 18.678389 4612.106401 0.004 0.99677
#> CabinD19 20.483775 6522.638613 0.003 0.99749
#> CabinD21 17.901259 6522.638599 0.003 0.99781
#> CabinD26 -17.072076 4513.176240 -0.004 0.99698
#> CabinD28 17.634323 6522.638599 0.003 0.99784
#> CabinD30 -17.301427 6522.638608 -0.003 0.99788
#> CabinD35 20.342937 6522.638600 0.003 0.99751
#> CabinD36 16.689218 6522.638618 0.003 0.99796
#> CabinD37 18.094314 6522.638615 0.003 0.99779
#> CabinD46 -16.330650 6522.638608 -0.003 0.99800
#> CabinD47 17.844005 6522.638601 0.003 0.99782
#> CabinD48 -17.214420 6522.638630 -0.003 0.99789
#> CabinD56 20.635920 6522.638610 0.003 0.99748
#> CabinD6 -16.799295 6522.638605 -0.003 0.99795
#> CabinD7 18.590222 6522.638627 0.003 0.99773
#> CabinE101 18.286738 4610.094498 0.004 0.99684
#> CabinE12 20.900953 6522.638613 0.003 0.99744
#> CabinE121 19.852236 6522.638601 0.003 0.99757
#> CabinE17 20.985456 6522.638617 0.003 0.99743
#> CabinE24 20.638689 4607.528613 0.004 0.99643
#> CabinE25 20.564969 4612.201890 0.004 0.99644
#> CabinE31 -16.617964 6522.638610 -0.003 0.99797
#> CabinE33 17.766299 4606.551251 0.004 0.99692
#> CabinE34 16.965508 6522.638623 0.003 0.99792
#> CabinE36 17.424129 6522.638617 0.003 0.99787
#> CabinE40 16.993675 6522.638639 0.003 0.99792
#> CabinE44 18.124711 6522.638616 0.003 0.99778
#> CabinE58 -16.250166 6522.638608 -0.002 0.99801
#> CabinE67 0.636389 1.762514 0.361 0.71805
#> CabinE68 17.306532 6522.638607 0.003 0.99788
#> CabinE77 -18.067132 6522.638613 -0.003 0.99779
#> CabinF E69 17.745582 6522.638603 0.003 0.99783
#> CabinF G63 -16.219818 6522.638607 -0.002 0.99802
#> CabinF G73 -16.867674 6522.638605 -0.003 0.99794
#> CabinF2 1.544397 1.466622 1.053 0.29233
#> CabinF33 18.195954 4608.987285 0.004 0.99685
#> CabinF38 -17.154317 6522.638613 -0.003 0.99790
#> CabinF4 18.572375 3965.646422 0.005 0.99626
#> CabinG6 -0.720965 1.039707 -0.693 0.48804
#> EmbarkedC 0.437251 0.312132 1.401 0.16126
#> EmbarkedQ 0.587058 0.369221 1.590 0.11184
#> EmbarkedS NA NA NA NA
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 834.17 on 622 degrees of freedom
#> Residual deviance: 490.73 on 513 degrees of freedom
#> AIC: 710.73
#>
#> Number of Fisher Scoring iterations: 17
In this first model, many of the predictor variables are not significant with respect to the target, so we will refit the model using stepwise selection.
library(MASS)
model2 <- stepAIC(model, direction = "backward")
#> Start: AIC=710.73
#> Survived ~ Age + Sex + Fare + Cabin + Embarked
#>
#> Df Deviance AIC
#> - Cabin 103 637.17 651.17
#> - Embarked 2 494.61 710.61
#> <none> 490.73 710.73
#> - Fare 1 496.19 714.19
#> - Age 1 498.75 716.75
#> - Sex 1 594.49 812.49
#>
#> Step: AIC=651.17
#> Survived ~ Age + Sex + Fare + Embarked
#>
#> Df Deviance AIC
#> - Embarked 3 641.95 649.95
#> <none> 637.17 651.17
#> - Age 1 639.90 651.90
#> - Fare 1 652.54 664.54
#> - Sex 1 778.05 790.05
#>
#> Step: AIC=649.95
#> Survived ~ Age + Sex + Fare
#>
#> Df Deviance AIC
#> <none> 641.95 649.95
#> - Age 1 644.57 650.57
#> - Fare 1 661.57 667.57
#> - Sex 1 789.18 795.18
Using the backward direction of the stepwise procedure, we obtain the following model.
summary(model2)
#>
#> Call:
#> glm(formula = Survived ~ Age + Sex + Fare, family = "binomial",
#> data = titanic_train)
#>
#> Deviance Residuals:
#> Min 1Q Median 3Q Max
#> -2.1999 -0.6555 -0.6073 0.8285 2.0577
#>
#> Coefficients:
#> Estimate Std. Error z value Pr(>|z|)
#> (Intercept) 1.026064 0.271342 3.781 0.000156 ***
#> Age -0.012556 0.007816 -1.606 0.108193
#> Sexmale -2.272862 0.199458 -11.395 < 0.0000000000000002 ***
#> Fare 0.008747 0.002291 3.817 0.000135 ***
#> ---
#> Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
#>
#> (Dispersion parameter for binomial family taken to be 1)
#>
#> Null deviance: 834.17 on 622 degrees of freedom
#> Residual deviance: 641.95 on 619 degrees of freedom
#> AIC: 649.95
#>
#> Number of Fisher Scoring iterations: 4
Using model2, the result of the stepwise selection, we now predict on the test data we set aside earlier.
titanic_test$prob_surv <- predict(model2, type = "response", newdata = titanic_test)
Let's look at the distribution of the predicted probabilities.
library(ggplot2)
ggplot(titanic_test, aes(x=prob_surv)) +
geom_density(lwd=0.5) +
labs(title = "Distribution of Probability Prediction Data") +
  theme_minimal()
From the plot above, we can see that the predicted probabilities lean toward 0.
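As a quick check of that reading (an added sketch, not part of the original workflow), we can compute the share of predicted probabilities that fall below the 0.5 cutoff:
# Proportion of test passengers with predicted survival probability below 0.5
mean(titanic_test$prob_surv < 0.5)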
titanic_test$pred_surv <- factor(ifelse(titanic_test$prob_surv > 0.5, yes = 1, no = 0))
titanic_test[1:10, c("pred_surv", "Survived")]
#> pred_surv Survived
#> 3 1 1
#> 4 1 1
#> 5 0 0
#> 6 0 0
#> 7 0 0
#> 8 0 0
#> 16 1 1
#> 17 0 0
#> 19 1 0
#> 21 0 0
conf <- confusionMatrix(titanic_test$pred_surv, titanic_test$Survived, positive = "1")
conf
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 149 28
#> 1 21 70
#>
#> Accuracy : 0.8172
#> 95% CI : (0.7656, 0.8616)
#> No Information Rate : 0.6343
#> P-Value [Acc > NIR] : 0.00000000004661
#>
#> Kappa : 0.5998
#>
#> Mcnemar's Test P-Value : 0.3914
#>
#> Sensitivity : 0.7143
#> Specificity : 0.8765
#> Pos Pred Value : 0.7692
#> Neg Pred Value : 0.8418
#> Prevalence : 0.3657
#> Detection Rate : 0.2612
#> Detection Prevalence : 0.3396
#> Balanced Accuracy : 0.7954
#>
#> 'Positive' Class : 1
#>
Based on the confusionMatrix output above, the model classifies the target (Survived vs. Not Survived) correctly 81.7% of the time (accuracy). Of all passengers who actually did not survive, the model predicts 87.6% correctly (specificity). Of all passengers who actually survived, the model predicts 71.4% correctly (sensitivity/recall). Of all passengers the model predicts as survivors, 76.9% truly survived (precision).
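As a cross-check (an added sketch, not in the original write-up), the same four numbers can be recovered by hand from the confusion-matrix table stored in conf:
# conf$table has Prediction in the rows and Reference in the columns
tab <- conf$table
TP <- tab["1", "1"]; TN <- tab["0", "0"]
FP <- tab["1", "0"]; FN <- tab["0", "1"]
(TP + TN) / sum(tab)   # accuracy     ~ 0.817
TN / (TN + FP)         # specificity  ~ 0.876
TP / (TP + FN)         # sensitivity  ~ 0.714
TP / (TP + FP)         # precision    ~ 0.769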
titanic_knn <- read.csv("train.csv")
titanic_knn <- titanic_knn[, -1]
titanic_knn$Survived <- as.factor(titanic_knn$Survived)
# Impute missing Age values with the mean, as before
titanic_knn$Age[is.na(titanic_knn$Age)] <- mean(titanic_knn$Age, na.rm = TRUE)
head(titanic_knn)
#> Survived Pclass Name Sex
#> 1 0 3 Braund, Mr. Owen Harris male
#> 2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female
#> 3 1 3 Heikkinen, Miss. Laina female
#> 4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female
#> 5 0 3 Allen, Mr. William Henry male
#> 6 0 3 Moran, Mr. James male
#> Age SibSp Parch Ticket Fare Cabin Embarked
#> 1 22.00000 1 0 A/5 21171 7.2500 S
#> 2 38.00000 1 0 PC 17599 71.2833 C85 C
#> 3 26.00000 0 0 STON/O2. 3101282 7.9250 S
#> 4 35.00000 1 0 113803 53.1000 C123 S
#> 5 35.00000 0 0 373450 8.0500 S
#> 6 29.69912 0 0 330877 8.4583 Q
RNGkind(sample.kind = "Rounding")
set.seed(100)
index <- sample(nrow(titanic_knn), nrow(titanic_knn)*0.7)
knn_train <- titanic_knn[index,]
knn_test <- titanic_knn[-index,]

prop.table(table(knn_train$Survived))
#>
#> 0 1
#> 0.6083467 0.3916533
For k-NN, we separate the predictors from the label (the target variable).
library(dplyr)
knn_train_real <- knn_train %>%
dplyr::select(c(Survived,Pclass,Age,SibSp,Parch,Fare))
knn_test_real <- knn_test %>%
  dplyr::select(c(Survived,Pclass,Age,SibSp,Parch,Fare))

train_x <- knn_train_real[,-1]
test_x <- knn_test_real[,-1]
train_y <- knn_train_real$Survived
test_y <- knn_test_real$Survived

The predictors are scaled using z-score standardization. The test data must also be scaled using the parameters (means and standard deviations) of the training data, because the test set is treated as unseen data.
train_xs <- scale(train_x)
test_xs <- scale(test_x,
center = attr(train_xs,"scaled:center") ,
                 scale = attr(train_xs,"scaled:scale"))
Finding the optimal value of K.
sqrt(nrow(train_xs))
#> [1] 24.95997
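As an added sketch (not part of the original workflow), instead of relying only on the sqrt(n) rule of thumb we could scan a few odd values of k and compare accuracy; strictly this should be done on a separate validation split or with cross-validation rather than on the test set.
# Try several odd k values and record test-set accuracy for each
k_grid <- seq(3, 31, by = 2)
acc <- sapply(k_grid, function(k) {
  pred <- knn(train = train_xs, test = test_xs, cl = train_y, k = k)
  mean(pred == test_y)
})
data.frame(k = k_grid, accuracy = acc)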
The rule of thumb suggests k of about 25; here an odd value, k = 23, is used, since an odd k avoids ties between the two classes. k-NN does not build an explicit model, so we go straight to prediction.
pred_knn <- knn(train = train_xs,
test = test_xs,
cl = train_y,
                k = 23)
Check the prediction results.
pred_knn
#> [1] 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 0 0 0 1 0 1 0 1 1 0 0 0 0 0 0 0 0 0
#> [38] 0 1 0 0 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 1 1 0 1 0 1 0 0 0 1 0 0 1
#> [75] 0 0 0 0 1 0 0 0 1 1 1 1 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 1 1 0 1 1 1 0 0 0
#> [112] 0 0 1 1 1 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 1 1 1 0 1 0 0 0 1 1
#> [149] 1 1 1 0 1 0 1 0 0 1 1 1 0 1 1 1 0 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 1
#> [186] 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 1 0 1 0 0 1 0 0 0 0 0 1 0 1 0 1 1 1 0 0 0
#> [223] 0 1 1 0 0 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 0 1 0 0 1 0 1 0 0 0 1 0 1 0 0
#> [260] 0 0 0 1 0 0 0 0 0
#> Levels: 0 1
Model Evaluation
confusionMatrix(data = pred_knn, reference = test_y, positive = "1")
#> Confusion Matrix and Statistics
#>
#> Reference
#> Prediction 0 1
#> 0 136 42
#> 1 34 56
#>
#> Accuracy : 0.7164
#> 95% CI : (0.6584, 0.7696)
#> No Information Rate : 0.6343
#> P-Value [Acc > NIR] : 0.002792
#>
#> Kappa : 0.378
#>
#> Mcnemar's Test P-Value : 0.422001
#>
#> Sensitivity : 0.5714
#> Specificity : 0.8000
#> Pos Pred Value : 0.6222
#> Neg Pred Value : 0.7640
#> Prevalence : 0.3657
#> Detection Rate : 0.2090
#> Detection Prevalence : 0.3358
#> Balanced Accuracy : 0.6857
#>
#> 'Positive' Class : 1
#>
Based on the confusionMatrix output above, the k-NN model classifies the target (Survived vs. Not Survived) correctly 71.6% of the time (accuracy). Of all passengers who actually did not survive, the model predicts 80% correctly (specificity). Of all passengers who actually survived, the model predicts 57.1% correctly (sensitivity/recall). Of all passengers the model predicts as survivors, 62.2% truly survived (precision).
I will pay the most attention to the precision metric, because I do not want the model to be wrong when it predicts that a passenger survived.
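To close, a small added sketch (not in the original code) putting the two models side by side on precision; conf is the logistic-regression result from above, and conf_knn recomputes the k-NN confusion matrix.
# Precision (Pos Pred Value) of both models on the same test split
conf_knn <- confusionMatrix(data = pred_knn, reference = test_y, positive = "1")
data.frame(model     = c("logistic regression", "k-NN (k = 23)"),
           precision = c(conf$byClass["Pos Pred Value"],
                         conf_knn$byClass["Pos Pred Value"]))
On that basis, the logistic regression model (precision of about 0.77) would be preferred over k-NN (about 0.62).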