This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.
Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Ctrl+Shift+Enter.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
url <- 'titantic.txt'
df <- read.table(url, header = TRUE, sep = '\t')
glimpse(df)
## Rows: 1,313
## Columns: 5
## $ Name <chr> "Allen, Miss Elisabeth Walton", "Allison, Miss Helen Loraine"~
## $ PClass <chr> "1st", "1st", "1st", "1st", "1st", "1st", "1st", "1st", "1st"~
## $ Age <dbl> 29.00, 2.00, 30.00, 25.00, 0.92, 47.00, 63.00, 39.00, 58.00, ~
## $ Sex <chr> "female", "female", "male", "female", "male", "male", "female~
## $ Survived <int> 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1~
df
Let’s drop age where it’s NA
new_df<- df %>% drop_na()
# new_df<- df
new_df <- subset(new_df, select = -c(Name))
new_df$PClass <- as.factor(new_df$PClass)
new_df$Sex <- as.factor(new_df$Sex)
new_df$Survived <- as.factor(new_df$Survived)
new_df
continuous <-select_if(new_df, is.numeric)
summary(continuous)
## Age
## Min. : 0.17
## 1st Qu.:21.00
## Median :28.00
## Mean :30.40
## 3rd Qu.:39.00
## Max. :71.00
Plot distribution of age
library(ggplot2)
ggplot(continuous, aes(x = Age)) +
geom_density(alpha = .2, fill = "#FF6666")
Check factor variables
This step has two objectives:
Check the level in each categorical column
Define new levels
We will divide this step into three parts:
Select the categorical columns
Store the bar chart of each column in a list
Print the graphs
factor <- data.frame(select_if(new_df, is.factor))
ncol(factor)
## [1] 3
library(ggplot2)
# Create graph for each column
graph <- lapply(names(factor),
function(x)
ggplot(factor, aes(get(x))) +
geom_bar() +
theme(axis.text.x = element_text(angle = 90)))
graph
## [[1]]
##
## [[2]]
##
## [[3]]
ggplot(new_df, aes(x = PClass, fill = Survived)) +
geom_bar(position = "fill") +
theme_classic()
ggplot(new_df, aes(x = Sex, fill = Survived)) +
geom_bar(position = "fill") +
theme_classic()
ggplot(new_df, aes(x = Sex, fill = Survived)) +
geom_bar(position = "fill") +
theme_classic()
Sex to Age
# box plot gender working time
ggplot(new_df, aes(x = Sex, y = Age)) +
geom_boxplot() +
stat_summary(fun.y = mean,
geom = "point",
size = 3,
color = "steelblue") +
theme_classic()
## Warning: `fun.y` is deprecated. Use `fun` instead.
# box plot gender working time
ggplot(new_df, aes(x = Survived, y = Age)) +
geom_boxplot() +
stat_summary(fun.y = mean,
geom = "point",
size = 3,
color = "steelblue") +
theme_classic()
## Warning: `fun.y` is deprecated. Use `fun` instead.
# box plot gender working time
ggplot(new_df, aes(x = PClass, y = Age)) +
geom_boxplot() +
stat_summary(fun.y = mean,
geom = "point",
size = 3,
color = "steelblue") +
theme_classic()
## Warning: `fun.y` is deprecated. Use `fun` instead.
Do an anova test
anova <- aov(Age~Survived, new_df)
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## Survived 1 576 576.0 2.84 0.0924 .
## Residuals 754 152931 202.8
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova <- aov(Age~PClass, new_df)
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## PClass 2 28917 14458 87.38 <2e-16 ***
## Residuals 753 124590 165
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
anova <- aov(Age~PClass+Survived, new_df)
summary(anova)
## Df Sum Sq Mean Sq F value Pr(>F)
## PClass 2 28917 14458 91.80 < 2e-16 ***
## Survived 1 6154 6154 39.07 6.84e-10 ***
## Residuals 752 118437 157
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Check for non-linearity
library(ggplot2)
ggplot(new_df, aes(x = PClass, y = Age)) +
geom_point(aes(color = Survived),
size = 0.5) +
stat_smooth(method = 'lm',
formula = y~poly(x, 2),
se = TRUE,
aes(color = Survived)) +
theme_classic()
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Convert data to numeric
corr <- data.frame(lapply(new_df, as.integer))
# Plot the graph
ggcorr(corr,
method = c("pairwise", "spearman"),
nbreaks = 6,
hjust = 0.8,
label = TRUE,
label_size = 3,
color = "grey50")
Step 5: Train/Test Set
set.seed(123)
create_train_test <- function(data, size = 0.8, train = TRUE) {
n_row = nrow(data)
total_row = size * n_row
train_sample <- 1: total_row
if (train == TRUE) {
return (data[train_sample, ])
} else {
return (data[-train_sample, ])
}
}
data_train <- create_train_test(new_df, 0.8, train = TRUE)
data_test <- create_train_test(new_df, 0.8, train = FALSE)
dim(data_train)
## [1] 604 4
dim(data_test)
## [1] 152 4
Step 6: BUild the model - Generalized Linear Model
formula <- Survived~.
logit <- glm(formula, data = data_train, family = 'binomial')
summary(logit)
##
## Call:
## glm(formula = formula, family = "binomial", data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9991 -0.6511 -0.3312 0.5784 2.6462
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.58531 0.48468 9.461 < 2e-16 ***
## PClass2nd -1.50959 0.28551 -5.287 1.24e-07 ***
## PClass3rd -2.67201 0.34685 -7.704 1.32e-14 ***
## Age -0.04960 0.00897 -5.529 3.22e-08 ***
## Sexmale -3.15209 0.24776 -12.723 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 833.50 on 603 degrees of freedom
## Residual deviance: 518.56 on 599 degrees of freedom
## AIC: 528.56
##
## Number of Fisher Scoring iterations: 5
logit$aic
## [1] 528.5567
logit
##
## Call: glm(formula = formula, family = "binomial", data = data_train)
##
## Coefficients:
## (Intercept) PClass2nd PClass3rd Age Sexmale
## 4.5853 -1.5096 -2.6720 -0.0496 -3.1521
##
## Degrees of Freedom: 603 Total (i.e. Null); 599 Residual
## Null Deviance: 833.5
## Residual Deviance: 518.6 AIC: 528.6
Step 7: Assess the performance of the model
predict <- predict(logit, data_test, type = 'response')
# confusion matrix
table_mat <- table(data_test$Survived, predict >= 0.5)
table_mat
##
## FALSE TRUE
## 0 87 30
## 1 18 17
Accuracy of Binomial Model
accuracy_Test <- sum(diag(table_mat)) / sum(table_mat)
accuracy_Test
## [1] 0.6842105
precision <- function(matrix) {
# True positive
tp <- matrix[2, 2]
# false positive
fp <- matrix[1, 2]
return (tp / (tp + fp))
}
recall <- function(matrix) {
# true positive
tp <- matrix[2, 2]# false positive
fn <- matrix[2, 1]
return (tp / (tp + fn))
}
prec <- precision(table_mat)
prec
## [1] 0.3617021
rec <- recall(table_mat)
rec
## [1] 0.4857143
f1 <- 2 * ((prec * rec) / (prec + rec))
f1
## [1] 0.4146341
# install.packages("ROCR")
library(ROCR)
ROCRpred <- prediction(predict, data_test$Survived)
ROCRperf <- performance(ROCRpred, 'tpr', 'fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2, 1.7))
formula_2 <- Survived~Age+Sex
logit_2 <- glm(formula_2, data = data_train, family = 'binomial')
summary(logit_2)
##
## Call:
## glm(formula = formula_2, family = "binomial", data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1041 -0.7229 -0.5919 0.6105 1.9353
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.127908 0.294856 7.217 5.32e-13 ***
## Age -0.015070 0.007106 -2.121 0.0339 *
## Sexmale -2.929549 0.219122 -13.370 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 833.5 on 603 degrees of freedom
## Residual deviance: 591.8 on 601 degrees of freedom
## AIC: 597.8
##
## Number of Fisher Scoring iterations: 4
predict_2 <- predict(logit_2, data_test, type = 'response')
table_mat_2 <- table(data_test$Survived, predict_2 > 0.5)
table_mat_2
##
## FALSE TRUE
## 0 84 33
## 1 15 20
precision_2 <- precision(table_mat_2)
precision_2
## [1] 0.3773585
recall_2 <- recall(table_mat_2)
recall_2
## [1] 0.5714286
f1_2 <- 2 * ((precision_2 * recall_2) / (precision_2 + recall_2))
f1_2
## [1] 0.4545455
accuracy_Test <- sum(diag(table_mat_2)) / sum(table_mat_2)
accuracy_Test
## [1] 0.6842105
anova(logit_2)
ROCRpred <- prediction(predict_2, data_test$Survived)
ROCRperf <- performance(ROCRpred, 'tpr', 'fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2, 1.7))
formula_2 <- Survived~Age
logit_2 <- glm(formula_2, data = data_train, family = 'binomial')
summary(logit_2)
##
## Call:
## glm(formula = formula_2, family = "binomial", data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2722 -1.1278 -0.9846 1.2239 1.4489
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.22409 0.19387 1.156 0.2477
## Age -0.01221 0.00562 -2.173 0.0298 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 833.50 on 603 degrees of freedom
## Residual deviance: 828.72 on 602 degrees of freedom
## AIC: 832.72
##
## Number of Fisher Scoring iterations: 4
predict_2 <- predict(logit_2, data_test, type = 'response')
table_mat_2 <- table(data_test$Survived, predict_2 > 0.5)
table_mat_2
##
## FALSE TRUE
## 0 95 22
## 1 26 9
precision_2 <- precision(table_mat_2)
precision_2
## [1] 0.2903226
recall_2 <- recall(table_mat_2)
recall_2
## [1] 0.2571429
f1_2 <- 2 * ((precision_2 * recall_2) / (precision_2 + recall_2))
f1_2
## [1] 0.2727273
accuracy_Test <- sum(diag(table_mat_2)) / sum(table_mat_2)
accuracy_Test
## [1] 0.6842105
anova(logit_2)
ROCRpred <- prediction(predict_2, data_test$Survived)
ROCRperf <- performance(ROCRpred, 'tpr', 'fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2, 1.7))
str(data_train)
## 'data.frame': 604 obs. of 4 variables:
## $ PClass : Factor w/ 3 levels "1st","2nd","3rd": 1 1 1 1 1 1 1 1 1 1 ...
## $ Age : num 29 2 30 25 0.92 47 63 39 58 71 ...
## $ Sex : Factor w/ 2 levels "female","male": 1 1 2 1 2 2 1 2 1 2 ...
## $ Survived: Factor w/ 2 levels "0","1": 2 1 1 1 2 2 2 1 2 1 ...
formula_2 <- Survived~PClass+Sex
logit_2 <- glm(formula_2, data = data_train, family = 'binomial')
summary(logit_2)
##
## Call:
## glm(formula = formula_2, family = "binomial", data = data_train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.2657 -0.6614 -0.4291 0.5989 2.2050
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.4868 0.2511 9.902 < 2e-16 ***
## PClass2nd -0.8592 0.2465 -3.485 0.000491 ***
## PClass3rd -1.7897 0.2886 -6.200 5.64e-10 ***
## Sexmale -3.0360 0.2347 -12.937 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 833.50 on 603 degrees of freedom
## Residual deviance: 552.68 on 600 degrees of freedom
## AIC: 560.68
##
## Number of Fisher Scoring iterations: 4
predict_2 <- predict(logit_2, data_test, type = 'response')
table_mat_2 <- table(data_test$Survived, predict_2 > 0.5)
table_mat_2
##
## FALSE TRUE
## 0 84 33
## 1 15 20
precision_2 <- precision(table_mat_2)
precision_2
## [1] 0.3773585
recall_2 <- recall(table_mat_2)
recall_2
## [1] 0.5714286
f1_2 <- 2 * ((precision_2 * recall_2) / (precision_2 + recall_2))
f1_2
## [1] 0.4545455
accuracy_Test <- sum(diag(table_mat_2)) / sum(table_mat_2)
accuracy_Test
## [1] 0.6842105
anova(logit_2)
ROCRpred <- prediction(predict_2, data_test$Survived)
ROCRperf <- performance(ROCRpred, 'tpr', 'fpr')
plot(ROCRperf, colorize = TRUE, text.adj = c(-0.2, 1.7))