This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
url <- 'titantic.txt'
df <- read.table(url, header = TRUE, sep = '\t')
glimpse(df)
## Rows: 1,313
## Columns: 5
## $ Name     <chr> "Allen, Miss Elisabeth Walton", "Allison, Miss Helen Loraine"~
## $ PClass   <chr> "1st", "1st", "1st", "1st", "1st", "1st", "1st", "1st", "1st"~
## $ Age      <dbl> 29.00, 2.00, 30.00, 25.00, 0.92, 47.00, 63.00, 39.00, 58.00, ~
## $ Sex      <chr> "female", "female", "male", "female", "male", "male", "female~
## $ Survived <int> 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1~
df
  1. Check continuous variables

Let’s drop age where it’s NA

new_df<- df %>% drop_na()
# new_df<- df
new_df <- subset(new_df, select = -c(Name))
new_df$PClass <- as.factor(new_df$PClass)
new_df$Sex <- as.factor(new_df$Sex)
new_df$Survived <- as.factor(new_df$Survived)
new_df
continuous <-select_if(new_df, is.numeric)
summary(continuous)
##       Age       
##  Min.   : 0.17  
##  1st Qu.:21.00  
##  Median :28.00  
##  Mean   :30.40  
##  3rd Qu.:39.00  
##  Max.   :71.00

Plot distribution of age

library(ggplot2)
ggplot(continuous, aes(x = Age)) +
    geom_density(alpha = .2, fill = "#FF6666")

  1. Check factor variables

    This step has two objectives:

    • Check the level in each categorical column

    • Define new levels

    We will divide this step into three parts:

    • Select the categorical columns

    • Store the bar chart of each column in a list

    • Print the graphs

factor <- data.frame(select_if(new_df, is.factor))
ncol(factor)
## [1] 3
library(ggplot2)
# Create graph for each column
graph <- lapply(names(factor),
    function(x) 
    ggplot(factor, aes(get(x))) +
        geom_bar() +
        theme(axis.text.x = element_text(angle = 90)))
graph
## [[1]]

## 
## [[2]]

## 
## [[3]]

  1. Feature Engineering (No need as variables are defined well)
  2. Summary statistic
ggplot(new_df, aes(x = PClass, fill = Survived)) +
    geom_bar(position = "fill") +
    theme_classic()

ggplot(new_df, aes(x = Sex, fill = Survived)) +
    geom_bar(position = "fill") +
    theme_classic()

ggplot(new_df, aes(x = Sex, fill = Survived)) +
    geom_bar(position = "fill") +
    theme_classic()

Sex to Age

# box plot gender working time
ggplot(new_df, aes(x = Sex, y = Age)) +
    geom_boxplot() +
    stat_summary(fun.y = mean,
        geom = "point",
        size = 3,
        color = "steelblue") +
    theme_classic()
## Warning: `fun.y` is deprecated. Use `fun` instead.

# box plot gender working time
ggplot(new_df, aes(x = Survived, y = Age)) +
    geom_boxplot() +
    stat_summary(fun.y = mean,
        geom = "point",
        size = 3,
        color = "steelblue") +
    theme_classic()
## Warning: `fun.y` is deprecated. Use `fun` instead.

# box plot gender working time
ggplot(new_df, aes(x = PClass, y = Age)) +
    geom_boxplot() +
    stat_summary(fun.y = mean,
        geom = "point",
        size = 3,
        color = "steelblue") +
    theme_classic()
## Warning: `fun.y` is deprecated. Use `fun` instead.

Do an anova test

anova <- aov(Age~Survived, new_df)
summary(anova)
##              Df Sum Sq Mean Sq F value Pr(>F)  
## Survived      1    576   576.0    2.84 0.0924 .
## Residuals   754 152931   202.8                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova <- aov(Age~PClass, new_df)
summary(anova)
##              Df Sum Sq Mean Sq F value Pr(>F)    
## PClass        2  28917   14458   87.38 <2e-16 ***
## Residuals   753 124590     165                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova <- aov(Age~PClass+Survived, new_df)
summary(anova)
##              Df Sum Sq Mean Sq F value   Pr(>F)    
## PClass        2  28917   14458   91.80  < 2e-16 ***
## Survived      1   6154    6154   39.07 6.84e-10 ***
## Residuals   752 118437     157                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Check for non-linearity

library(ggplot2)
ggplot(new_df, aes(x = PClass, y = Age)) +
    geom_point(aes(color = Survived),
        size = 0.5) +
    stat_smooth(method = 'lm',
        formula = y~poly(x, 2),
        se = TRUE,
        aes(color = Survived)) +
    theme_classic()

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
# Convert data to numeric
corr <- data.frame(lapply(new_df, as.integer))
# Plot the graph
ggcorr(corr,
    method = c("pairwise", "spearman"),
    nbreaks = 6,
    hjust = 0.8,
    label = TRUE,
    label_size = 3,
    color = "grey50")

Step 5: Train/Test Set

set.seed(123)
create_train_test <- function(data, size = 0.8, train = TRUE) {
    n_row = nrow(data)
    total_row = size * n_row
    train_sample <- 1: total_row
    if (train == TRUE) {
        return (data[train_sample, ])
    } else {
        return (data[-train_sample, ])
    }
}
data_train <- create_train_test(new_df, 0.8, train = TRUE)
data_test <- create_train_test(new_df, 0.8, train = FALSE)
dim(data_train)
## [1] 604   4
dim(data_test)
## [1] 152   4

Step 6: BUild the model - Generalized Linear Model

Model 1

formula <- Survived~.
logit <- glm(formula, data = data_train, family = 'binomial')
summary(logit)
## 
## Call:
## glm(formula = formula, family = "binomial", data = data_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9991  -0.6511  -0.3312   0.5784   2.6462  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  4.58531    0.48468   9.461  < 2e-16 ***
## PClass2nd   -1.50959    0.28551  -5.287 1.24e-07 ***
## PClass3rd   -2.67201    0.34685  -7.704 1.32e-14 ***
## Age         -0.04960    0.00897  -5.529 3.22e-08 ***
## Sexmale     -3.15209    0.24776 -12.723  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 833.50  on 603  degrees of freedom
## Residual deviance: 518.56  on 599  degrees of freedom
## AIC: 528.56
## 
## Number of Fisher Scoring iterations: 5
logit$aic
## [1] 528.5567
logit
## 
## Call:  glm(formula = formula, family = "binomial", data = data_train)
## 
## Coefficients:
## (Intercept)    PClass2nd    PClass3rd          Age      Sexmale  
##      4.5853      -1.5096      -2.6720      -0.0496      -3.1521  
## 
## Degrees of Freedom: 603 Total (i.e. Null);  599 Residual
## Null Deviance:       833.5 
## Residual Deviance: 518.6     AIC: 528.6

Step 7: Assess the performance of the model

predict <- predict(logit, data_test, type = 'response')
# confusion matrix
table_mat <- table(data_test$Survived, predict >= 0.5)
table_mat
##    
##     FALSE TRUE
##   0    87   30
##   1    18   17

Accuracy of Binomial Model

accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
accuracy_Test
## [1] 0.6842105
precision <- function(matrix) {
    # True positive
    tp <- matrix[2, 2]
    # false positive
    fp <- matrix[1, 2]
    return (tp / (tp + fp))
}
recall <- function(matrix) {
# true positive
    tp <- matrix[2, 2]# false positive
    fn <- matrix[2, 1]
    return (tp / (tp + fn))
}
prec <- precision(table_mat)
prec
## [1] 0.3617021
rec <- recall(table_mat)
rec
## [1] 0.4857143
f1 <- 2 * ((prec * rec) / (prec + rec))
f1
## [1] 0.4146341
# install.packages("ROCR")
library(ROCR)
ROCRpred <- prediction(predict, data_test$Survived)
ROCRperf <- performance(ROCRpred, 'tpr', 'fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2, 1.7))

Model 2

formula_2 <- Survived~Age+Sex
logit_2 <- glm(formula_2, data = data_train, family = 'binomial')
summary(logit_2)
## 
## Call:
## glm(formula = formula_2, family = "binomial", data = data_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1041  -0.7229  -0.5919   0.6105   1.9353  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  2.127908   0.294856   7.217 5.32e-13 ***
## Age         -0.015070   0.007106  -2.121   0.0339 *  
## Sexmale     -2.929549   0.219122 -13.370  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 833.5  on 603  degrees of freedom
## Residual deviance: 591.8  on 601  degrees of freedom
## AIC: 597.8
## 
## Number of Fisher Scoring iterations: 4
predict_2 <- predict(logit_2, data_test, type = 'response')
table_mat_2 <- table(data_test$Survived, predict_2 > 0.5)
table_mat_2
##    
##     FALSE TRUE
##   0    84   33
##   1    15   20
precision_2 <- precision(table_mat_2)
precision_2
## [1] 0.3773585
recall_2 <- recall(table_mat_2)
recall_2
## [1] 0.5714286
f1_2 <- 2 * ((precision_2 * recall_2) / (precision_2 + recall_2))
f1_2
## [1] 0.4545455
accuracy_Test <- sum(diag(table_mat_2)) / sum(table_mat_2)
accuracy_Test
## [1] 0.6842105
anova(logit_2)
ROCRpred <- prediction(predict_2, data_test$Survived)
ROCRperf <- performance(ROCRpred, 'tpr', 'fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2, 1.7))

formula_2 <- Survived~Age
logit_2 <- glm(formula_2, data = data_train, family = 'binomial')
summary(logit_2)
## 
## Call:
## glm(formula = formula_2, family = "binomial", data = data_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2722  -1.1278  -0.9846   1.2239   1.4489  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)  
## (Intercept)  0.22409    0.19387   1.156   0.2477  
## Age         -0.01221    0.00562  -2.173   0.0298 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 833.50  on 603  degrees of freedom
## Residual deviance: 828.72  on 602  degrees of freedom
## AIC: 832.72
## 
## Number of Fisher Scoring iterations: 4
predict_2 <- predict(logit_2, data_test, type = 'response')
table_mat_2 <- table(data_test$Survived, predict_2 > 0.5)
table_mat_2
##    
##     FALSE TRUE
##   0    95   22
##   1    26    9
precision_2 <- precision(table_mat_2)
precision_2
## [1] 0.2903226
recall_2 <- recall(table_mat_2)
recall_2
## [1] 0.2571429
f1_2 <- 2 * ((precision_2 * recall_2) / (precision_2 + recall_2))
f1_2
## [1] 0.2727273
accuracy_Test <- sum(diag(table_mat_2)) / sum(table_mat_2)
accuracy_Test
## [1] 0.6842105
anova(logit_2)
ROCRpred <- prediction(predict_2, data_test$Survived)
ROCRperf <- performance(ROCRpred, 'tpr', 'fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2, 1.7))

str(data_train)
## 'data.frame':    604 obs. of  4 variables:
##  $ PClass  : Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ...
##  $ Age     : num  29 2 30 25 0.92 47 63 39 58 71 ...
##  $ Sex     : Factor w/ 2 levels "female","male": 1 1 2 1 2 2 1 2 1 2 ...
##  $ Survived: Factor w/ 2 levels "0","1": 2 1 1 1 2 2 2 1 2 1 ...

Model 3

formula_2 <- Survived~PClass+Sex
logit_2 <- glm(formula_2, data = data_train, family = 'binomial')
summary(logit_2)
## 
## Call:
## glm(formula = formula_2, family = "binomial", data = data_train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.2657  -0.6614  -0.4291   0.5989   2.2050  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.4868     0.2511   9.902  < 2e-16 ***
## PClass2nd    -0.8592     0.2465  -3.485 0.000491 ***
## PClass3rd    -1.7897     0.2886  -6.200 5.64e-10 ***
## Sexmale      -3.0360     0.2347 -12.937  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 833.50  on 603  degrees of freedom
## Residual deviance: 552.68  on 600  degrees of freedom
## AIC: 560.68
## 
## Number of Fisher Scoring iterations: 4
predict_2 <- predict(logit_2, data_test, type = 'response')
table_mat_2 <- table(data_test$Survived, predict_2 > 0.5)
table_mat_2
##    
##     FALSE TRUE
##   0    84   33
##   1    15   20
precision_2 <- precision(table_mat_2)
precision_2
## [1] 0.3773585
recall_2 <- recall(table_mat_2)
recall_2
## [1] 0.5714286
f1_2 <- 2 * ((precision_2 * recall_2) / (precision_2 + recall_2))
f1_2
## [1] 0.4545455
accuracy_Test <- sum(diag(table_mat_2)) / sum(table_mat_2)
accuracy_Test
## [1] 0.6842105
anova(logit_2)
ROCRpred <- prediction(predict_2, data_test$Survived)
ROCRperf <- performance(ROCRpred, 'tpr', 'fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2, 1.7))