1. Data Importation

df_train <- read.csv("train.csv")
df_test <- read.csv("test.csv")
glimpse(df_train)
## Rows: 891
## Columns: 12
## $ PassengerId <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17…
## $ Survived    <int> 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, …
## $ Pclass      <int> 3, 1, 3, 1, 3, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 2, 3, …
## $ Name        <fct> "Braund, Mr. Owen Harris", "Cumings, Mrs. John Bradley (F…
## $ Sex         <fct> male, female, female, female, male, male, male, male, fem…
## $ Age         <dbl> 22, 38, 26, 35, 35, NA, 54, 2, 27, 14, 4, 58, 20, 39, 14,…
## $ SibSp       <int> 1, 1, 0, 1, 0, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 0, 1, …
## $ Parch       <int> 0, 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, …
## $ Ticket      <fct> A/5 21171, PC 17599, STON/O2. 3101282, 113803, 373450, 33…
## $ Fare        <dbl> 7.2500, 71.2833, 7.9250, 53.1000, 8.0500, 8.4583, 51.8625…
## $ Cabin       <fct> , C85, , C123, , , E46, , , , G6, C103, , , , , , , , , ,…
## $ Embarked    <fct> S, C, S, S, S, Q, S, S, S, C, S, S, S, S, S, S, Q, S, S, …

2. Data Information

  1. Data Information
  1. Variable Predictor Sex, Age, SibSp, Parch, Fare, Embarked

  2. Variable Result : Survived

3. Data Wrangling

3.1 Data Wrangling into Data Train

#Check & Count NA Value
anyNA(df_train)
## [1] TRUE
summary(df_train)
##   PassengerId       Survived          Pclass     
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000  
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000  
##  Median :446.0   Median :0.0000   Median :3.000  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309  
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000  
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000  
##                                                  
##                                     Name         Sex           Age       
##  Abbing, Mr. Anthony                  :  1   female:314   Min.   : 0.42  
##  Abbott, Mr. Rossmore Edward          :  1   male  :577   1st Qu.:20.12  
##  Abbott, Mrs. Stanton (Rosa Hunt)     :  1                Median :28.00  
##  Abelson, Mr. Samuel                  :  1                Mean   :29.70  
##  Abelson, Mrs. Samuel (Hannah Wizosky):  1                3rd Qu.:38.00  
##  Adahl, Mr. Mauritz Nils Martin       :  1                Max.   :80.00  
##  (Other)                              :885                NA's   :177    
##      SibSp           Parch             Ticket         Fare       
##  Min.   :0.000   Min.   :0.0000   1601    :  7   Min.   :  0.00  
##  1st Qu.:0.000   1st Qu.:0.0000   347082  :  7   1st Qu.:  7.91  
##  Median :0.000   Median :0.0000   CA. 2343:  7   Median : 14.45  
##  Mean   :0.523   Mean   :0.3816   3101295 :  6   Mean   : 32.20  
##  3rd Qu.:1.000   3rd Qu.:0.0000   347088  :  6   3rd Qu.: 31.00  
##  Max.   :8.000   Max.   :6.0000   CA 2144 :  6   Max.   :512.33  
##                                   (Other) :852                   
##          Cabin     Embarked
##             :687    :  2   
##  B96 B98    :  4   C:168   
##  C23 C25 C27:  4   Q: 77   
##  G6         :  4   S:644   
##  C22 C26    :  3           
##  D          :  3           
##  (Other)    :186

Since NA Value only available on Age Variable, we should consider that is NA Value on age has majority/ minority portion of total row?

#Count NA Value
sum(is.na(df_train$Age))
## [1] 177
#Percentage of NA VAlues
sum(is.na(df_train$Age))/nrow(df_train) 
## [1] 0.1986532

19.86% of NA Values on Age is not dominant, we can filter that row that contain NA Value to be not involed as one of the predictor variable on the next process

Data Wrangling Steps :

  1. Omit unused variables : PassangerID, Name, Cabin, Ticket

  2. Convert Sex into number : 1 = male & 0 = female

  3. Convert Embarked into number : S = 1, C = 2, and Q = 3

  4. Round into 2 decimals on Fare

  5. Filter NA Value on Age

df_tr <- df_train %>% 
  select(-c(PassengerId, Name, Cabin, Ticket)) %>% 
  mutate(Sex = as.factor(ifelse(Sex == "male",1,0)),
         Embarked = as.factor(ifelse(Embarked == "S",1,ifelse(Embarked == "C",2,3))),
         Fare = round(Fare, 2)) %>% 
  filter(is.na(Age) == FALSE)

glimpse(df_tr)
## Rows: 714
## Columns: 8
## $ Survived <int> 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, …
## $ Pclass   <int> 3, 1, 3, 1, 3, 1, 3, 3, 2, 3, 1, 3, 3, 3, 2, 3, 3, 2, 2, 3, …
## $ Sex      <fct> 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0, …
## $ Age      <dbl> 22, 38, 26, 35, 35, 54, 2, 27, 14, 4, 58, 20, 39, 14, 55, 2,…
## $ SibSp    <int> 1, 1, 0, 1, 0, 0, 3, 0, 1, 1, 0, 0, 1, 0, 0, 4, 1, 0, 0, 0, …
## $ Parch    <int> 0, 0, 0, 0, 0, 0, 1, 2, 0, 1, 0, 0, 5, 0, 0, 1, 0, 0, 0, 0, …
## $ Fare     <dbl> 7.25, 71.28, 7.92, 53.10, 8.05, 51.86, 21.07, 11.13, 30.07, …
## $ Embarked <fct> 1, 2, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 3, …

4.2 Data Wrangling into Data Test

#Check NA Value
anyNA(df_test)
## [1] TRUE
summary(df_test)
##   PassengerId         Pclass     
##  Min.   : 892.0   Min.   :1.000  
##  1st Qu.: 996.2   1st Qu.:1.000  
##  Median :1100.5   Median :3.000  
##  Mean   :1100.5   Mean   :2.266  
##  3rd Qu.:1204.8   3rd Qu.:3.000  
##  Max.   :1309.0   Max.   :3.000  
##                                  
##                                         Name         Sex           Age       
##  Abbott, Master. Eugene Joseph            :  1   female:152   Min.   : 0.17  
##  Abelseth, Miss. Karen Marie              :  1   male  :266   1st Qu.:21.00  
##  Abelseth, Mr. Olaus Jorgensen            :  1                Median :27.00  
##  Abrahamsson, Mr. Abraham August Johannes :  1                Mean   :30.27  
##  Abrahim, Mrs. Joseph (Sophie Halaut Easu):  1                3rd Qu.:39.00  
##  Aks, Master. Philip Frank                :  1                Max.   :76.00  
##  (Other)                                  :412                NA's   :86     
##      SibSp            Parch             Ticket         Fare        
##  Min.   :0.0000   Min.   :0.0000   PC 17608:  5   Min.   :  0.000  
##  1st Qu.:0.0000   1st Qu.:0.0000   113503  :  4   1st Qu.:  7.896  
##  Median :0.0000   Median :0.0000   CA. 2343:  4   Median : 14.454  
##  Mean   :0.4474   Mean   :0.3923   16966   :  3   Mean   : 35.627  
##  3rd Qu.:1.0000   3rd Qu.:0.0000   220845  :  3   3rd Qu.: 31.500  
##  Max.   :8.0000   Max.   :9.0000   347077  :  3   Max.   :512.329  
##                                    (Other) :396   NA's   :1        
##              Cabin     Embarked
##                 :327   C:102   
##  B57 B59 B63 B66:  3   Q: 46   
##  A34            :  2   S:270   
##  B45            :  2           
##  C101           :  2           
##  C116           :  2           
##  (Other)        : 80

There are 2 Variables who contain NA Values: 1. Age : 86 NAs 2. Fare : 1 NA

#Percentage of NA Values
sum(is.na(df_test$Age))/ nrow(df_test)
## [1] 0.2057416

20.57% of NA Values on Age is not dominant, we can filter that row that contain NA Value to be not involed as one of the predictor variable on the next process

#Check Data Structure
glimpse(df_test)
## Rows: 418
## Columns: 11
## $ PassengerId <int> 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, 902, 90…
## $ Pclass      <int> 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 3, 1, 1, 2, 1, 2, 2, 3, 3, …
## $ Name        <fct> "Kelly, Mr. James", "Wilkes, Mrs. James (Ellen Needs)", "…
## $ Sex         <fct> male, female, male, male, female, male, female, male, fem…
## $ Age         <dbl> 34.5, 47.0, 62.0, 27.0, 22.0, 14.0, 30.0, 26.0, 18.0, 21.…
## $ SibSp       <int> 0, 1, 0, 0, 1, 0, 0, 1, 0, 2, 0, 0, 1, 1, 1, 1, 0, 0, 1, …
## $ Parch       <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Ticket      <fct> 330911, 363272, 240276, 315154, 3101298, 7538, 330972, 24…
## $ Fare        <dbl> 7.8292, 7.0000, 9.6875, 8.6625, 12.2875, 9.2250, 7.6292, …
## $ Cabin       <fct> , , , , , , , , , , , , B45, , E31, , , , , , , , , , B57…
## $ Embarked    <fct> Q, S, Q, S, S, S, Q, S, C, S, S, S, S, S, S, C, Q, C, S, …

Data Wrangling Steps : ~1-4 Similiar with data wrangling that has been implemented into df_train

  1. Omit unused variables : PassangerID, Name, Cabin, Ticket

  2. Convert Sex into number : 1 = male & 0 = female

  3. Convert Embarked into number : S = 1, C = 2, and Q = 3

  4. Round into 2 decimals on Fare

  5. Filter NA Value on Age & Parch

df_ts <- df_test %>% 
  select(-c(PassengerId, Name, Cabin, Ticket)) %>% 
    mutate(Sex = as.numeric(ifelse(Sex == "male",1,0)),
         Embarked = as.numeric(ifelse(Embarked == "S",1,ifelse(Embarked == "C",2,3))),
         Fare = round(Fare, 2)) %>% 
  filter(is.na(Age) == FALSE,
         is.na(Fare) == FALSE)

glimpse(df_ts)
## Rows: 331
## Columns: 7
## $ Pclass   <int> 3, 3, 2, 3, 3, 3, 3, 2, 3, 3, 1, 1, 2, 1, 2, 2, 3, 3, 3, 1, …
## $ Sex      <dbl> 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, …
## $ Age      <dbl> 34.5, 47.0, 62.0, 27.0, 22.0, 14.0, 30.0, 26.0, 18.0, 21.0, …
## $ SibSp    <int> 0, 1, 0, 0, 1, 0, 0, 1, 0, 2, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, …
## $ Parch    <int> 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, …
## $ Fare     <dbl> 7.83, 7.00, 9.69, 8.66, 12.29, 9.22, 7.63, 29.00, 7.23, 24.1…
## $ Embarked <dbl> 3, 1, 3, 1, 1, 1, 3, 1, 2, 1, 1, 1, 1, 1, 2, 3, 2, 1, 2, 2, …

5. Machine Learning Development

5.1 Logistic Regression

a. Feature Selection X & Y Data Train

# x data train
log_train_x <- df_tr %>% 
  select(-Survived) %>% 
  mutate(Sex = as.numeric(Sex),
         Embarked = as.numeric(Embarked))

#scaling X feature
log_train_x <- scale(log_train_x)

# scaling data test predictor variables - since harus in numeric values
log_test_x <- scale(df_ts,
                center = attr(log_train_x, "scaled:center"),
                scale = attr(log_train_x, "scaled:scale"))

log_test_x <- as.data.frame(log_test_x)
# Convert into Data Frame - Merge x and y data train
log_train_x <- as.data.frame(log_train_x)

# y data train
log_train_y <- df_tr %>% 
  select(Survived)

log_train_x <- bind_cols(log_train_x,log_train_y)

b. Model Development & Result

model_log <- glm(formula = Survived ~ .,
                 family = binomial,
                 data=log_train_x)
#penggunaan metode backward
model_log <- step(object = model_log,
                 direction = "backward")
## Start:  AIC=651.49
## Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked
## 
##            Df Deviance    AIC
## - Parch     1   635.73 649.73
## - Embarked  1   635.81 649.81
## - Fare      1   636.12 650.12
## <none>          635.49 651.49
## - SibSp     1   644.69 658.69
## - Age       1   667.05 681.05
## - Pclass    1   694.91 708.91
## - Sex       1   811.70 825.70
## 
## Step:  AIC=649.73
## Survived ~ Pclass + Sex + Age + SibSp + Fare + Embarked
## 
##            Df Deviance    AIC
## - Embarked  1   636.07 648.07
## - Fare      1   636.22 648.22
## <none>          635.73 649.73
## - SibSp     1   646.54 658.54
## - Age       1   667.25 679.25
## - Pclass    1   698.71 710.71
## - Sex       1   817.02 829.02
## 
## Step:  AIC=648.07
## Survived ~ Pclass + Sex + Age + SibSp + Fare
## 
##          Df Deviance    AIC
## - Fare    1   636.72 646.72
## <none>        636.07 648.07
## - SibSp   1   647.23 657.23
## - Age     1   667.86 677.86
## - Pclass  1   699.21 709.21
## - Sex     1   820.07 830.07
## 
## Step:  AIC=646.72
## Survived ~ Pclass + Sex + Age + SibSp
## 
##          Df Deviance    AIC
## <none>        636.72 646.72
## - SibSp   1   647.29 655.29
## - Age     1   669.44 677.44
## - Pclass  1   742.29 750.29
## - Sex     1   823.84 831.84
summary(model_log)
## 
## Call:
## glm(formula = Survived ~ Pclass + Sex + Age + SibSp, family = binomial, 
##     data = log_train_x)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7714  -0.6445  -0.3836   0.6276   2.4585  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -0.5212     0.1002  -5.203 1.97e-07 ***
## Pclass       -1.1043     0.1181  -9.350  < 2e-16 ***
## Sex          -1.2643     0.1034 -12.229  < 2e-16 ***
## Age          -0.6448     0.1185  -5.442 5.26e-08 ***
## SibSp        -0.3497     0.1126  -3.106  0.00189 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 964.52  on 713  degrees of freedom
## Residual deviance: 636.72  on 709  degrees of freedom
## AIC: 646.72
## 
## Number of Fisher Scoring iterations: 5

c. Predict into data test with using threshold >= 0.5

log_test_x$Survived <- predict(object = model_log,
                                 newdata = log_test_x,
                                 type = "response")
log_test_x$Survived <- as.factor(ifelse(log_test_x$Survived >= 0.5, 1, 0))  
summary(log_test_x$Survived)
##   0   1 
##  18 313

d. Conclusion

** Utilizing Logistic Regression (Backward) with Threshold >= 0.5, predict 313 victims are survived from the prediction **

5.2 KNN

a. Feature Selection X & Y Data Train

#Determine k values
round(sqrt(nrow(df_tr)))
## [1] 27
knn_x_train <- df_tr %>% 
  select(-Survived)

knn_x_test <- df_ts

b. Model Development, Prediction, & Result

model_knn <- knn(train = knn_x_train, 
                 test = knn_x_test, 
                 cl = df_tr$Survived, k = 27)

summary(model_knn)
##   0   1 
## 214 117

c. Conclusion

** Utilizing KNN, predict 117 victims are survived from the prediction **