- Defining the problem statement (Introduction)
- Collecting the data (Source)
- Preprocessing the data (Data wrangling)
- Exploratory data analysis (EDA)
- Feature engineering (Binning, Dummy)
- Modelling (Train, Validation)
- Testing
Complete the analysis of what sorts of people were likely to survive.
In particular, we ask you to apply the tools of machine learning to predict which passengers survived the Titanic tragedy.
The training and testing data sets are provided by Kaggle and can be downloaded directly from Kaggle.
- Load libraries
# data wrangling
suppressMessages(library(tidyverse))
suppressMessages(library(forcats))
suppressMessages(library(stringr))
suppressMessages(library(caTools))
# data assessment/visualizations
suppressMessages(library(DT))
suppressMessages(library(data.table))
suppressMessages(library(pander))
suppressMessages(library(ggplot2))
suppressMessages(library(scales))
suppressMessages(library(grid))
suppressMessages(library(gridExtra))
suppressMessages(library(corrplot))
suppressMessages(library(VIM))
suppressMessages(library(knitr))
suppressMessages(library(vcd))
suppressMessages(library(caret))
# model
suppressMessages(library(xgboost))
suppressMessages(library(MLmetrics))
suppressMessages(library(randomForest))
suppressMessages(library(rpart))
suppressMessages(library(rpart.plot))
suppressMessages(library(car))
suppressMessages(library(e1071))
suppressMessages(library(ROCR))
suppressMessages(library(pROC))
suppressMessages(library(glmnet))
suppressMessages(library(gmodels))
suppressMessages(library(rattle)) #decision tree fancy tree
suppressMessages(library(readr))
suppressMessages(library(sqldf))
suppressMessages(library(kableExtra))
suppressMessages(library(ggthemes))
- Load data
train <- read_csv('./input/train.csv')
test <- read_csv('./input/test.csv')
- Labeling
train$set <- "train"
test$set <- "test"
test$Survived <- NA
full <- rbind(train, test)
names(full) <- tolower(names(full))
- Data wrangling
1. Check the data distribution and structure 2. Check the column names and their characteristics (unique values) 3. Check for missing values (and outliers)
- Check missing values
# check data
str(full)
# Unique values per column
lapply(full, function(x) length(unique(x)))
#Check for Missing values
missing_values <- full %>% summarize_all(~ sum(is.na(.)) / n())
missing_values <- gather(missing_values, key="feature", value="missing_pct")
missing_values %>%
ggplot(aes(x=reorder(feature,-missing_pct),y=missing_pct)) +
geom_bar(stat="identity",fill="red")+
coord_flip()+theme_bw()
#Useful data quality function for missing values
check_column = function(df,colname){
test_data = df[[colname]]
numMissing = max(sum(is.na(test_data)|is.nan(test_data)|test_data==''),0)
if (class(test_data) == 'numeric' | class(test_data) == 'Date' | class(test_data) == 'difftime' | class(test_data) == 'integer'){
list('col' = colname,'class' = class(test_data), 'num' = length(test_data) - numMissing, 'numMissing' = numMissing, 'numInfinite' = sum(is.infinite(test_data)), 'avgVal' = mean(test_data,na.rm=TRUE), 'minVal' = round(min(test_data,na.rm = TRUE)), 'maxVal' = round(max(test_data,na.rm = TRUE)))
} else{
list('col' = colname,'class' = class(test_data), 'num' = length(test_data) - numMissing, 'numMissing' = numMissing, 'numInfinite' = NA, 'avgVal' = NA, 'minVal' = NA, 'maxVal' = NA)
}
}
check_all_cols = function(df){
resDF = data.frame()
for (colName in names(df)){
resDF = rbind(resDF,as.data.frame(check_column(df=df,colname=colName)))
}
resDF
}
datatable(check_all_cols(full), style="bootstrap", class="table-condensed", options = list(dom = 'tp',scrollX = TRUE))
miss_pct <- map_dbl(full, function(x) { round((sum(is.na(x)) / length(x)) * 100, 1) })
miss_pct <- miss_pct[miss_pct > 0]
data.frame(miss=miss_pct, var=names(miss_pct), row.names=NULL) %>%
ggplot(aes(x=reorder(var, -miss), y=miss)) +
geom_bar(stat='identity', fill='red') +
labs(x='', y='% missing', title='Percent missing data by feature') +
theme(axis.text.x=element_text(angle=90, hjust=1)) +
coord_flip()+theme_bw()
colSums(is.na(full))
## passengerid survived pclass name sex age
## 0 418 0 0 0 263
## sibsp parch ticket fare cabin embarked
## 0 0 0 1 1014 2
## set
## 0
- Summary : missing values
Of the 1,309 rows in total (891 train rows, 418 test rows), the "Age" variable has only 1,046 non-missing values and the "Cabin" variable only 295.
Feeding such missing information into a machine learning model can introduce misleading signals.
Before modeling, the missing fields should be turned into meaningful information through feature engineering (preprocessing).
The test data is missing the "Age" and "Cabin" variables in similar proportions.
full <- full %>%
mutate(survived = case_when(survived==1 ~ "Yes",
survived==0 ~ "No"))
crude_summary <- full %>%
filter(set=="train") %>%
select(passengerid, survived) %>%
group_by(survived) %>%
summarise(cnt = n()) %>%
mutate(prob = cnt / sum(cnt))
crude_survrate <- crude_summary$prob[crude_summary$survived=="Yes"]
kable(crude_summary, caption="2x2 Contingency Table on Survival.", format="markdown")
| survived | cnt | prob |
|---|---|---|
| No | 549 | 0.6161616 |
| Yes | 342 | 0.3838384 |
names <- full$name
title <- gsub("^.*, (.*?)\\..*$", "\\1", names)
full$title <- title
table(title)
## title
## Capt Col Don Dona Dr
## 1 4 1 1 8
## Jonkheer Lady Major Master Miss
## 1 1 2 61 260
## Mlle Mme Mr Mrs Ms
## 2 1 757 197 2
## Rev Sir the Countess
## 8 1 1
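To see what the extraction pattern does, here is a tiny illustration on a made-up name (hypothetical example, not a row from the data):
# the regex keeps whatever sits between ", " and the first "." that follows it
gsub("^.*, (.*?)\\..*$", "\\1", "Smith, Mr. John")
## [1] "Mr"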
## Miss, Mrs, Master and Mr account for most of the passengers
## It is better to group the remaining titles into a bigger bucket (checking gender and survival rate) to avoid overfitting
full$title[full$title == 'Mlle'] <- 'Miss'
full$title[full$title == 'Ms'] <- 'Miss'
full$title[full$title == 'Mme'] <- 'Mrs'
full$title[full$title == 'Lady'] <- 'Miss'
full$title[full$title == 'Dona'] <- 'Miss'
## Creating a new level from only a few observations risks overfitting
## However, folding these titles into the existing levels might lose predictive power, since they are mostly military officers, doctors and nobility
full$title[full$title == 'Capt'] <- 'Officer'
full$title[full$title == 'Col'] <- 'Officer'
full$title[full$title == 'Major'] <- 'Officer'
full$title[full$title == 'Dr'] <- 'Officer'
full$title[full$title == 'Rev'] <- 'Officer'
full$title[full$title == 'Don'] <- 'Officer'
full$title[full$title == 'Sir'] <- 'Officer'
full$title[full$title == 'the Countess'] <- 'Officer'
full$title[full$title == 'Jonkheer'] <- 'Officer'
crude_summary <- full %>%
filter(set=="train") %>%
select(title, survived) %>%
group_by(title, survived) %>%
summarise(cnt = n()) %>%
mutate(prob = cnt / sum(cnt))
kable(crude_summary, caption="Survival by title.", format="markdown")
| title | survived | cnt | prob |
|---|---|---|---|
| Master | No | 17 | 0.4250000 |
| Master | Yes | 23 | 0.5750000 |
| Miss | No | 55 | 0.2956989 |
| Miss | Yes | 131 | 0.7043011 |
| Mr | No | 436 | 0.8433269 |
| Mr | Yes | 81 | 0.1566731 |
| Mrs | No | 26 | 0.2063492 |
| Mrs | Yes | 100 | 0.7936508 |
| Officer | No | 15 | 0.6818182 |
| Officer | Yes | 7 | 0.3181818 |
Trick
Two ways to fill the missing Age values:
1. Use the overall mean Age of the passengers
2. Use the Title information and fill with the median Age per title
- Fill missing values : Age
full %>%
group_by(title) %>% # split by title
summarise(mean_age = mean(age, na.rm=TRUE), # mean age
sd_age = sd(age, na.rm=TRUE), # age standard deviation
median_age = median(age, na.rm=TRUE), # median age
n = n()) # passenger count
## # A tibble: 5 x 5
## title mean_age sd_age median_age n
## <chr> <dbl> <dbl> <dbl> <int>
## 1 Master 5.48 4.16 4 61
## 2 Miss 22.0 12.3 22 266
## 3 Mr 32.3 12.4 29 757
## 4 Mrs 36.9 12.9 35 198
## 5 Officer 45.3 11.5 48 27
full$age <- ifelse((is.na(full$age) & full$title == 'Master'), 4, full$age)
full$age <- ifelse((is.na(full$age) & full$title == 'Miss'), 22, full$age)
full$age <- ifelse((is.na(full$age) & full$title == 'Mr'), 29, full$age)
full$age <- ifelse((is.na(full$age) & full$title == 'Mrs'), 35, full$age)
full$age <- ifelse((is.na(full$age) & full$title == 'Officer'), 48, full$age)
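Equivalently, the title-wise median imputation can be written generically with dplyr so it keeps working even if the medians shift; a minimal sketch (the hard-coded values above come straight from the summary table):
full <- full %>%
  group_by(title) %>%
  mutate(age = ifelse(is.na(age), median(age, na.rm = TRUE), age)) %>%
  ungroup()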
- Binning : Age Group
full <- full %>%
mutate(
age = ifelse(is.na(age), mean(full$age, na.rm=TRUE), age),
`age_group` = case_when(age < 13 ~ "Age.0012",
age >= 13 & age < 18 ~ "Age.1317",
age >= 18 & age < 37 ~ "Age.1836",
age >= 37 & age < 61 ~ "Age.3760",
age >= 61 ~ "Age.61Ov"))
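The same bins could also be produced with cut(); a sketch assuming the same break points as the case_when above:
full$age_group <- cut(full$age,
                      breaks = c(-Inf, 13, 18, 37, 61, Inf), right = FALSE,
                      labels = c("Age.0012", "Age.1317", "Age.1836", "Age.3760", "Age.61Ov"))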
- Fill missing values : Embarked
- Use the most common code to replace NAs in the Embarked feature.
full$embarked <- replace(full$embarked, which(is.na(full$embarked)), 'S')
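Rather than hard-coding 'S', the most common port can also be looked up programmatically; a sketch that gives the same result here:
# table() drops NAs, so which.max picks the modal embarkation port ("S")
most_common_port <- names(which.max(table(full$embarked)))
full$embarked[is.na(full$embarked)] <- most_common_port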
Trick
A statistical-inference approach to handling this missing value:
Fare is correlated with the ticket class (Pclass).
We will fill the missing field with the median Fare for each class.
- Fill missing values : Fare
full %>%
group_by(pclass) %>% # split by class
summarise(mean_fare = mean(fare, na.rm=TRUE), # mean fare
sd_fare = sd(fare, na.rm=TRUE), # fare standard deviation
median_fare = median(fare, na.rm=TRUE), # median fare
cnt = n()) # passenger count
## # A tibble: 3 x 5
## pclass mean_fare sd_fare median_fare cnt
## <int> <dbl> <dbl> <dbl> <int>
## 1 1 87.5 80.4 60 323
## 2 2 21.2 13.6 15.0 277
## 3 3 13.3 11.5 8.05 709
full$fare <- ifelse((is.na(full$fare) & full$pclass == 1), 60, full$fare)
full$fare <- ifelse((is.na(full$fare) & full$pclass == 2), 15, full$fare)
full$fare <- ifelse((is.na(full$fare) & full$pclass == 3), 8.05, full$fare)
full$cabin <- str_sub(full$cabin, 1, 1) # keep only the deck letter (first character of cabin)
full$family_size <- full$sibsp + full$parch + 1
full$family_sized[full$family_size == 1] <- 'Single'
full$family_sized[full$family_size < 5 & full$family_size >= 2] <- 'Small'
full$family_sized[full$family_size >= 5] <- 'Big'
full$family_sized <- as.factor(full$family_sized)
##Engineer features based on all the passengers with the same ticket
ticket_unique <- rep(0, nrow(full))
tickets <- unique(full$ticket)
for (i in 1:length(tickets)) {
current.ticket <- tickets[i]
party.indexes <- which(full$ticket == current.ticket)
for (k in 1:length(party.indexes)) {
ticket_unique[party.indexes[k]] <- length(party.indexes)
}
}
full$ticket_unique <- ticket_unique
full$ticket_size[full$ticket_unique == 1] <- 'Single'
full$ticket_size[full$ticket_unique < 5 & full$ticket_unique>= 2] <- 'Small'
full$ticket_size[full$ticket_unique >= 5] <- 'Big'
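The explicit double loop above can be replaced by a grouped count that computes the same per-ticket party size; a dplyr sketch:
full <- full %>%
  group_by(ticket) %>%
  mutate(ticket_unique = n()) %>% # number of passengers sharing this ticket
  ungroup()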
ggplot(full %>% filter(set=="train"), aes(pclass, fill=survived)) +
geom_bar(position = "fill") +
scale_fill_brewer(palette="Set1") +
scale_y_continuous(labels=percent) +
ylab("Survival Rate") +
geom_hline(yintercept=crude_survrate, col="white", lty=2, size=2) +
ggtitle("Survival Rate by Class") +
theme_minimal()
ggplot(full %>% filter(set=="train"), aes(sex, fill=survived)) +
geom_bar(position = "fill") +
scale_fill_brewer(palette="Set1") +
scale_y_continuous(labels=percent) +
ylab("Survival Rate") +
geom_hline(yintercept=crude_survrate, col="white", lty=2, size=2) +
ggtitle("Survival Rate by Sex") +
theme_minimal()
tbl_age <- full %>%
filter(set=="train") %>%
select(age, survived) %>%
group_by(survived) %>%
summarise(mean_age = mean(age, na.rm=TRUE))
ggplot(full %>% filter(set=="train"), aes(age, fill=survived)) +
geom_histogram(aes(y=..density..), alpha=0.5) +
geom_density(alpha=.2, aes(colour=survived)) +
geom_vline(data=tbl_age, aes(xintercept=mean_age, colour=survived), lty=2, size=1) +
scale_fill_brewer(palette="Set1") +
scale_colour_brewer(palette="Set1") +
scale_y_continuous(labels=percent) +
ylab("Density") +
ggtitle("Survival Rate by Age") +
theme_minimal()
ggplot(full %>% filter(set=="train" & !is.na(age)), aes(age_group, fill=survived)) +
geom_bar(position = "fill") +
scale_fill_brewer(palette="Set1") +
scale_y_continuous(labels=percent) +
ylab("Survival Rate") +
geom_hline(yintercept=crude_survrate, col="white", lty=2, size=2) +
ggtitle("Survival Rate by Age Group") +
theme_minimal()
ggplot(full %>% filter(set=="train"), aes(sibsp, fill=survived)) +
geom_bar(position = "fill") +
scale_fill_brewer(palette="Set1") +
scale_y_continuous(labels=percent) +
ylab("Survival Rate") +
geom_hline(yintercept=crude_survrate, col="white", lty=2, size=2) +
ggtitle("Survival Rate by SibSp") +
theme_minimal()
ggplot(full %>% filter(set=="train"), aes(parch, fill=survived)) +
geom_bar(position = "fill") +
scale_fill_brewer(palette="Set1") +
scale_y_continuous(labels=percent) +
ylab("Survival Rate") +
geom_hline(yintercept=crude_survrate, col="white", lty=2, size=2) +
ggtitle("Survival Rate by Parch") +
theme_minimal()
ggplot(full %>% filter(set=="train"), aes(embarked, fill=survived)) +
geom_bar(position = "fill") +
scale_fill_brewer(palette="Set1") +
scale_y_continuous(labels=percent) +
ylab("Survival Rate") +
geom_hline(yintercept=crude_survrate, col="white", lty=2, size=2) +
ggtitle("Survival Rate by Embarked") +
theme_minimal()
ggplot(full %>% filter(set=="train") %>% na.omit, aes(title, fill=survived)) +
geom_bar(position="fill") +
scale_fill_brewer(palette="Set1") +
scale_y_continuous(labels=percent) +
ylab("Survival Rate") +
geom_hline(yintercept=crude_survrate, col="white", lty=2, size=2) +
ggtitle("Survival Rate by Title") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(full %>% filter(set=="train") %>% na.omit, aes(family_size, fill=survived)) +
geom_bar(position="fill") +
scale_fill_brewer(palette="Set1") +
scale_y_continuous(labels=percent) +
ylab("Survival Rate") +
geom_hline(yintercept=crude_survrate, col="white", lty=2, size=2) +
ggtitle("Survival Rate by Family Group") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(full %>% filter(set=="train"), aes(pclass, fill=survived)) +
geom_bar(position="stack") +
scale_fill_brewer(palette="Set1") +
scale_y_continuous(labels=comma) +
ylab("Passengers") +
ggtitle("Survived by Class") +
theme_minimal()
ggplot(full %>% filter(set=="train"), aes(sex, fill=survived)) +
geom_bar(position="stack") +
scale_fill_brewer(palette="Set1") +
scale_y_continuous(labels=comma) +
ylab("Passengers") +
ggtitle("Survived by Sex") +
theme_minimal()
ggplot(full %>% filter(set=="train"), aes(age, fill=survived)) +
geom_histogram(aes(y=..count..), alpha=0.5) +
geom_vline(data=tbl_age, aes(xintercept=mean_age, colour=survived), lty=2, size=1) +
scale_fill_brewer(palette="Set1") +
scale_colour_brewer(palette="Set1") +
scale_y_continuous(labels=comma) +
ylab("Density") +
ggtitle("Survived by Age") +
theme_minimal()
ggplot(full %>% filter(set=="train" & !is.na(age)), aes(age_group, fill=survived)) +
geom_bar(position="stack") +
scale_fill_brewer(palette="Set1") +
scale_y_continuous(labels=comma) +
ylab("Passengers") +
ggtitle("Survived by Age Group") +
theme_minimal()
ggplot(full %>% filter(set=="train"), aes(sibsp, fill=survived)) +
geom_bar(position="stack") +
scale_fill_brewer(palette="Set1") +
scale_y_continuous(labels=comma) +
ylab("Passengers") +
ggtitle("Survived by SibSp") +
theme_minimal()
ggplot(full %>% filter(set=="train"), aes(parch, fill=survived)) +
geom_bar(position="stack") +
scale_fill_brewer(palette="Set1") +
scale_y_continuous(labels=comma) +
ylab("Passengers") +
ggtitle("Survived by Parch") +
theme_minimal()
ggplot(full %>% filter(set=="train"), aes(embarked, fill=survived)) +
geom_bar(position="stack") +
scale_fill_brewer(palette="Set1") +
scale_y_continuous(labels=comma) +
ylab("Passengers") +
ggtitle("Survived by Embarked") +
theme_minimal()
ggplot(full %>% filter(set=="train") %>% na.omit, aes(title, fill=survived)) +
geom_bar(position="stack") +
scale_fill_brewer(palette="Set1") +
scale_y_continuous(labels=comma) +
ylab("Passengers") +
ggtitle("Survived by Title") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(full %>% filter(set=="train") %>% na.omit, aes(family_size, fill=survived)) +
geom_bar(position="stack") +
scale_fill_brewer(palette="Set1") +
scale_y_continuous(labels=comma) +
ylab("Passengers") +
ggtitle("Survived by Family Group") +
theme_minimal() +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
full <- full[, -c(1, 4, 6, 7, 8, 9)] # drop passengerid, name, age, sibsp, parch and ticket
str(full)
## Classes 'tbl_df', 'tbl' and 'data.frame': 1309 obs. of 13 variables:
## $ survived : chr "No" "Yes" "Yes" "Yes" ...
## $ pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ sex : chr "male" "female" "female" "female" ...
## $ fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ cabin : chr NA "C" NA "C" ...
## $ embarked : chr "S" "C" "S" "S" ...
## $ set : chr "train" "train" "train" "train" ...
## $ title : chr "Mr" "Mrs" "Miss" "Mrs" ...
## $ age_group : chr "Age.1836" "Age.3760" "Age.1836" "Age.1836" ...
## $ family_size : num 2 2 1 2 1 1 1 5 3 2 ...
## $ family_sized : Factor w/ 3 levels "Big","Single",..: 3 3 2 3 2 2 2 1 3 3 ...
## $ ticket_unique: num 1 2 1 2 1 1 2 5 3 2 ...
## $ ticket_size : chr "Single" "Small" "Single" "Small" ...
feature vector map :
- Master : 0
- Miss : 1
- Mr : 2
- Mrs: 3
- Officer : 4
full$title <- ifelse(full$title == "Master", 0,
ifelse(full$title == "Miss", 1,
ifelse(full$title == "Mr", 2,
ifelse(full$title == "Mrs", 3, 4))))
feature vector map :
- C : 0
- Q : 1
- S : 2
full$embarked <- ifelse(full$embarked == "C", 0,
ifelse(full$embarked == "Q", 1, 2))
feature vector map :
- Age.0012 : 0
- Age.1317 : 1
- Age.1836 : 2
- Age.3760 : 3
- Age.61Ov : 4
full$age_group <- ifelse(full$age_group == "Age.0012", 0,
ifelse(full$age_group == "Age.1317", 1,
ifelse(full$age_group == "Age.1836", 2,
ifelse(full$age_group == "Age.3760", 3, 4))))
feature vector map :
- Single : 0
- Small : 1
- Big : 2
full$family_sized <- ifelse(full$family_sized == "Single", 0,
ifelse(full$family_sized == "Small", 1, 2))
feature vector map :
- Single : 0
- Small : 1
- Big : 2
full$ticket_size <- ifelse(full$ticket_size == "Single", 0,
ifelse(full$ticket_size == "Small", 1, 2))
feature vector map :
- male : 0
- female : 1
full$sex <- ifelse(full$sex == "male", 0, 1)
feature vector map :
- No : 0
- Yes : 1
full$survived <- ifelse(full$survived == "No", 0, 1)
feature vector map:
- $17 or less : 0
- $17-30 : 1
- $30-100 : 2
- over $100 : 3
full$fare <- ifelse(full$fare <= 17, 0,
ifelse(full$fare > 17 & full$fare <= 30, 1,
ifelse(full$fare > 30 & full$fare <= 100, 2, 3)))
NOTE
The reason for using decimal codes: ML classifiers work with numbers, and many of them rely on Euclidean distance.
If the features do not share a similar range, the ones with larger numeric gaps end up dominating the distance.
For example, suppose sex is coded 0/1 while ticket prices are 10 and 20 dollars.
If the 10-dollar and 20-dollar tickets are both third class, they are hardly different. Yet male and female are completely different but only 1 apart, while the two fares differ by 10. Without a common range, a naive machine learning classifier would treat the 10-dollar fare gap as more important than the male/female difference. To keep the ranges comparable, the codes below use decimals; this technique is called feature scaling.
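As a generic illustration of that idea, a simple min-max scaler squeezes any numeric feature into [0, 1]; this is only a sketch of the technique, while the cabin encoding below uses hand-picked decimal codes instead:
rescale01 <- function(x) {
  rng <- range(x, na.rm = TRUE)
  (x - rng[1]) / (rng[2] - rng[1]) # map the smallest value to 0 and the largest to 1
}
# e.g. rescale01(c(10, 20)) returns 0 and 1, the same range as the 0/1 sex coding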
full$cabin <- ifelse(full$cabin == "A", 0,
ifelse(full$cabin == "B", 0.4,
ifelse(full$cabin == "C", 0.8,
ifelse(full$cabin == "D", 1.2,
ifelse(full$cabin == "E", 1.6,
ifelse(full$cabin == "F", 2,
ifelse(full$cabin == "G", 2.4,2.8)))))))
full %>%
group_by(pclass) %>% # split by class
summarise(mean_cabin = mean(cabin, na.rm=TRUE), # mean cabin code
sd_cabin = sd(cabin, na.rm=TRUE), # cabin code standard deviation
median_cabin = median(cabin, na.rm=TRUE), # median cabin code
cnt = n()) # passenger count
# fill missing cabin with the median cabin code for each pclass
full$cabin <- ifelse((is.na(full$cabin) & full$pclass == 1), 0.8, full$cabin)
full$cabin <- ifelse((is.na(full$cabin) & full$pclass == 2), 2.0, full$cabin)
full$cabin <- ifelse((is.na(full$cabin) & full$pclass == 3), 2.0, full$cabin)
tbl_corr <- full %>%
filter(set=="train") %>%
select_if(is.numeric) %>%
cor(use="complete.obs") %>%
corrplot.mixed(tl.cex=0.85)
rmse <- function(yi, yhat_i){
sqrt(mean((yi - yhat_i)^2))
}
binomial_deviance <- function(y_obs, yhat){
epsilon = 0.0001
yhat = ifelse(yhat < epsilon, epsilon, yhat)
yhat = ifelse(yhat > 1-epsilon, 1-epsilon, yhat)
a = ifelse(y_obs==0, 0, y_obs * log(y_obs/yhat))
b = ifelse(y_obs==1, 0, (1-y_obs) * log((1-y_obs)/(1-yhat)))
return(2*sum(a + b))
}
panel.cor <- function(x, y, digits = 2, prefix = "", cex.cor, ...){
usr <- par("usr"); on.exit(par(usr))
par(usr = c(0, 1, 0, 1))
r <- abs(cor(x, y))
txt <- format(c(r, 0.123456789), digits = digits)[1]
txt <- paste0(prefix, txt)
if(missing(cex.cor)) cex.cor <- 0.8/strwidth(txt)
text(0.5, 0.5, txt, cex = cex.cor * r)
}
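panel.cor is meant to be dropped into a pairs() scatter-plot matrix; a usage sketch on a few of the numeric columns (not part of the original output):
pairs(full[, c("fare", "family_size", "ticket_unique")],
      lower.panel = panel.smooth, # smoothed scatter plots below the diagonal
      upper.panel = panel.cor) # correlation text, sized by strength, above the diagonal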
#full$survived <- as.factor(full$survived)
##full$pclass <- as.factor(full$pclass)
full$sex <- as.factor(full$sex)
#full$fare <- as.factor(full$fare)
full$cabin <- as.factor(full$cabin)
full$embarked <- as.factor(full$embarked)
full$title <- as.factor(full$title)
full$age_group <- as.factor(full$age_group)
##full$family_size <- as.factor(full$family_size)
full$family_sized <- as.factor(full$family_sized)
##full$ticket_unique <- as.factor(full$ticket_unique)
#full$ticket_size <- as.factor(full$ticket_size)
train <- full[full$set == 'train',]
train <- train[, -7] # drop the set column
prop.table(table(train$survived))
##
## 0 1
## 0.6161616 0.3838384
str(train)
## Classes 'tbl_df', 'tbl' and 'data.frame': 891 obs. of 12 variables:
## $ survived : num 0 1 1 1 0 0 0 0 1 1 ...
## $ pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ sex : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 1 2 2 ...
## $ fare : num 0 2 0 2 0 0 2 1 0 2 ...
## $ cabin : Factor w/ 8 levels "0","0.4","0.8",..: 6 3 6 3 6 6 5 6 6 6 ...
## $ embarked : Factor w/ 3 levels "0","1","2": 3 1 3 3 3 2 3 3 3 1 ...
## $ title : Factor w/ 5 levels "0","1","2","3",..: 3 4 2 4 3 3 3 1 4 4 ...
## $ age_group : Factor w/ 5 levels "0","1","2","3",..: 3 4 3 3 3 3 4 1 3 2 ...
## $ family_size : num 2 2 1 2 1 1 1 5 3 2 ...
## $ family_sized : Factor w/ 3 levels "0","1","2": 2 2 1 2 1 1 1 3 2 2 ...
## $ ticket_unique: num 1 2 1 2 1 1 2 5 3 2 ...
## $ ticket_size : num 0 1 0 1 0 0 1 2 1 1 ...
head(train)
## # A tibble: 6 x 12
## survived pclass sex fare cabin embarked title age_group family_size
## <dbl> <int> <fct> <dbl> <fct> <fct> <fct> <fct> <dbl>
## 1 0 3 0 0 2 2 2 2 2
## 2 1 1 1 2 0.8 0 3 3 2
## 3 1 3 1 0 2 2 1 2 1
## 4 1 1 1 2 0.8 2 3 2 2
## 5 0 3 0 0 2 2 2 2 1
## 6 0 3 0 0 2 1 2 2 1
## # ... with 3 more variables: family_sized <fct>, ticket_unique <dbl>,
## # ticket_size <dbl>
### Let's prepare and keep the data in the proper format
feature <- train
response <- as.factor(train$survived)
feature$survived <- as.factor(train$survived)
### For cross-validation purposes, keep 20% of the data aside from the original train set
## This is just to check how well the model works on unseen data
set.seed(1234)
ind <- createDataPartition(feature$survived, times=1, p=0.8, list=FALSE)
train_val <- feature[ind, ]
test_val <- feature[-ind, ]
####Check the proportion of the survival rate in the original training data, the current training data and the test data
round(prop.table(table(train$survived)), digits = 1)
##
## 0 1
## 0.6 0.4
round(prop.table(table(train_val$survived)), digits = 1)
##
## 0 1
## 0.6 0.4
round(prop.table(table(test_val$survived)), digits = 1)
##
## 0 1
## 0.6 0.4
##Random forest usually performs better than a single tree, but a single tree is easy to use and to illustrate
set.seed(1234)
Model_DT=rpart(survived~.,data=train_val,method="class")
rpart.plot(Model_DT,extra = 3,fallen.leaves = T)
###Surprise: check out the plot, our single-tree model uses only title, pclass and ticket_size and discards the rest
###Let's predict on the training data and check the accuracy of the single tree
PRE_TDT=predict(Model_DT, newdata=train_val, type="class")
confusionMatrix(PRE_TDT,train_val$survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 400 78
## 1 40 196
##
## Accuracy : 0.8347
## 95% CI : (0.8054, 0.8612)
## No Information Rate : 0.6162
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6412
## Mcnemar's Test P-Value : 0.0006589
##
## Sensitivity : 0.9091
## Specificity : 0.7153
## Pos Pred Value : 0.8368
## Neg Pred Value : 0.8305
## Prevalence : 0.6162
## Detection Rate : 0.5602
## Detection Prevalence : 0.6695
## Balanced Accuracy : 0.8122
##
## 'Positive' Class : 0
##
#####Accuracy is 0.8347
####Not bad at all for a single tree using just 3 features
##There is a chance of overfitting with a single tree, so I will cross-validate using the 10-fold technique
set.seed(1234)
cv.10 <- createMultiFolds(train_val$survived, k = 10, times = 10)
# Control
ctrl <- trainControl(method = "repeatedcv", number = 10, repeats = 10,
index = cv.10)
train_val <- as.data.frame(train_val)
##Train the data
Model_CDT <- train(x = train_val[,-1], y = train_val[,1], method = "rpart", tuneLength = 30,
trControl = ctrl)
##Check the accuracy
##Accuracy using 10-fold cross-validation of the single tree is 0.8139
##The earlier single tree seems to have overfit: there our accuracy rate was 0.83
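The cross-validated figure quoted above can be read off the caret object; a sketch:
Model_CDT # prints the resampling summary across the 30 candidate cp values
max(Model_CDT$results$Accuracy) # best mean accuracy over the 10x10 folds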
# check the variable importance; is it the same as in the single tree?
rpart.plot(Model_CDT$finalModel,extra = 3,fallen.leaves = T)
##Yes, there is no change in the variable importance
###Let's cross-validate the accuracy using the data kept aside for testing
PRE_VDTS=predict(Model_CDT$finalModel,newdata=test_val,type="class")
confusionMatrix(PRE_VDTS,test_val$survived, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 94 23
## 1 15 45
##
## Accuracy : 0.7853
## 95% CI : (0.7174, 0.8434)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 1.07e-06
##
## Kappa : 0.536
## Mcnemar's Test P-Value : 0.2561
##
## Sensitivity : 0.6618
## Specificity : 0.8624
## Pos Pred Value : 0.7500
## Neg Pred Value : 0.8034
## Prevalence : 0.3842
## Detection Rate : 0.2542
## Detection Prevalence : 0.3390
## Balanced Accuracy : 0.7621
##
## 'Positive' Class : 1
##
###On the held-out data the accuracy is 0.7853, a bit below the cross-validated training accuracy
col_names <- names(train_val)
train_val[col_names] <- lapply(train_val[col_names] , factor)
test_val[col_names] <- lapply(test_val[col_names] , factor)
## Training
dt <- rpart(as.factor(survived)~., data = train_val, cp = 0.1^20) # use all variables and grow a (nearly) full tree
xerror_min_which <- which.min(dt$cptable[, "xerror"])
xerror_min <- min(dt$cptable[, "xerror"])
printcp(dt) # print the cp table
##
## Classification tree:
## rpart(formula = as.factor(survived) ~ ., data = train_val, cp = 0.1^20)
##
## Variables actually used in tree construction:
## [1] age_group cabin embarked family_size fare
## [6] pclass ticket_unique title
##
## Root node error: 274/714 = 0.38375
##
## n= 714
##
## CP nsplit rel error xerror xstd
## 1 4.7080e-01 0 1.00000 1.00000 0.047424
## 2 9.8540e-02 1 0.52920 0.52920 0.039232
## 3 9.1241e-03 2 0.43066 0.44891 0.036825
## 4 7.2993e-03 5 0.40146 0.46350 0.037292
## 5 6.0827e-03 6 0.39416 0.45255 0.036943
## 6 3.6496e-03 10 0.36496 0.43066 0.036221
## 7 1.0000e-20 11 0.36131 0.41606 0.035721
plotcp(dt) # plot the cp curve
abline(v = xerror_min_which, lty = 2, col = "red")
text(xerror_min_which, xerror_min, labels = round(xerror_min_which, 2), pos = 3, col = "red")
# pruning
dt_prune <- prune(dt, cp = dt$cptable[which.min(dt$cptable[, "xerror"]), "CP"])
# training accuracy
pred_tr_dt <- predict(dt_prune, type = "class") # predict class labels
CrossTable(x = train_val$survived, y = pred_tr_dt, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 714
##
##
## | pred_tr_dt
## train_val$survived | 0 | 1 | Row Total |
## -------------------|-----------|-----------|-----------|
## 0 | 405 | 35 | 440 |
## | 0.920 | 0.080 | 0.616 |
## | 0.864 | 0.143 | |
## | 0.567 | 0.049 | |
## -------------------|-----------|-----------|-----------|
## 1 | 64 | 210 | 274 |
## | 0.234 | 0.766 | 0.384 |
## | 0.136 | 0.857 | |
## | 0.090 | 0.294 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 469 | 245 | 714 |
## | 0.657 | 0.343 | |
## -------------------|-----------|-----------|-----------|
##
##
confusionMatrix(train_val$survived, pred_tr_dt)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 405 35
## 1 64 210
##
## Accuracy : 0.8613
## 95% CI : (0.8338, 0.8859)
## No Information Rate : 0.6569
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.7009
## Mcnemar's Test P-Value : 0.004891
##
## Sensitivity : 0.8635
## Specificity : 0.8571
## Pos Pred Value : 0.9205
## Neg Pred Value : 0.7664
## Prevalence : 0.6569
## Detection Rate : 0.5672
## Detection Prevalence : 0.6162
## Balanced Accuracy : 0.8603
##
## 'Positive' Class : 0
##
# validation accuracy
pred_te_dt <- predict(dt_prune, test_val, type = "class")
CrossTable(x = test_val$survived, y = pred_te_dt, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 177
##
##
## | pred_te_dt
## test_val$survived | 0 | 1 | Row Total |
## ------------------|-----------|-----------|-----------|
## 0 | 94 | 15 | 109 |
## | 0.862 | 0.138 | 0.616 |
## | 0.803 | 0.250 | |
## | 0.531 | 0.085 | |
## ------------------|-----------|-----------|-----------|
## 1 | 23 | 45 | 68 |
## | 0.338 | 0.662 | 0.384 |
## | 0.197 | 0.750 | |
## | 0.130 | 0.254 | |
## ------------------|-----------|-----------|-----------|
## Column Total | 117 | 60 | 177 |
## | 0.661 | 0.339 | |
## ------------------|-----------|-----------|-----------|
##
##
confusionMatrix(test_val$survived, pred_te_dt, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 94 15
## 1 23 45
##
## Accuracy : 0.7853
## 95% CI : (0.7174, 0.8434)
## No Information Rate : 0.661
## P-Value [Acc > NIR] : 0.0002046
##
## Kappa : 0.536
## Mcnemar's Test P-Value : 0.2561450
##
## Sensitivity : 0.7500
## Specificity : 0.8034
## Pos Pred Value : 0.6618
## Neg Pred Value : 0.8624
## Prevalence : 0.3390
## Detection Rate : 0.2542
## Detection Prevalence : 0.3842
## Balanced Accuracy : 0.7767
##
## 'Positive' Class : 1
##
# plotting
plot(dt_prune, margin = 0.1)
text(dt_prune, use.n = T)
fancyRpartPlot(dt_prune, cex = 1) #fancy tree
dt_prune$variable.importance
## title sex family_size ticket_unique ticket_size
## 113.588063 97.143475 64.015037 60.299877 53.966070
## family_sized cabin pclass embarked fare
## 33.146229 17.475853 9.947168 6.778094 6.625211
## age_group
## 4.907818
barplot(dt_prune$variable.importance)
set.seed(1234)
rf_1 <- randomForest(x=train_val[,-1], y=train_val[,1], ntree = 500, importance = TRUE)
rf_1
##
## Call:
## randomForest(x = train_val[, -1], y = train_val[, 1], ntree = 500, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 16.53%
## Confusion matrix:
## 0 1 class.error
## 0 396 44 0.100000
## 1 74 200 0.270073
varImpPlot(rf_1)
# training accuracy
pred_tr_rf <- predict(rf_1, type = "class") # predict class labels
CrossTable(x = train_val$survived, y = pred_tr_rf, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 714
##
##
## | pred_tr_rf
## train_val$survived | 0 | 1 | Row Total |
## -------------------|-----------|-----------|-----------|
## 0 | 396 | 44 | 440 |
## | 0.900 | 0.100 | 0.616 |
## | 0.843 | 0.180 | |
## | 0.555 | 0.062 | |
## -------------------|-----------|-----------|-----------|
## 1 | 74 | 200 | 274 |
## | 0.270 | 0.730 | 0.384 |
## | 0.157 | 0.820 | |
## | 0.104 | 0.280 | |
## -------------------|-----------|-----------|-----------|
## Column Total | 470 | 244 | 714 |
## | 0.658 | 0.342 | |
## -------------------|-----------|-----------|-----------|
##
##
confusionMatrix(train_val$survived, pred_tr_rf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 396 44
## 1 74 200
##
## Accuracy : 0.8347
## 95% CI : (0.8054, 0.8612)
## No Information Rate : 0.6583
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6432
## Mcnemar's Test P-Value : 0.007593
##
## Sensitivity : 0.8426
## Specificity : 0.8197
## Pos Pred Value : 0.9000
## Neg Pred Value : 0.7299
## Prevalence : 0.6583
## Detection Rate : 0.5546
## Detection Prevalence : 0.6162
## Balanced Accuracy : 0.8311
##
## 'Positive' Class : 0
##
####Random forest training accuracy is 83.47% (OOB error 16.53%), about the same as the single decision tree
####Let's remove the redundant family_size variable and do the modeling again
train_val1=train_val[,-9]
test_val1=test_val[,-9]
set.seed(1234)
rf_2 <- randomForest(x = train_val1[,-1], y=train_val1[,1], importance = TRUE, ntree = 500)
rf_2
##
## Call:
## randomForest(x = train_val1[, -1], y = train_val1[, 1], ntree = 500, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 3
##
## OOB estimate of error rate: 15.97%
## Confusion matrix:
## 0 1 class.error
## 0 400 40 0.09090909
## 1 74 200 0.27007299
varImpPlot(rf_2)
# training accuracy
pred_tr_rf <- predict(rf_2, type = "class") # predict class labels
CrossTable(x = train_val1$survived, y = pred_tr_rf, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 714
##
##
## | pred_tr_rf
## train_val1$survived | 0 | 1 | Row Total |
## --------------------|-----------|-----------|-----------|
## 0 | 400 | 40 | 440 |
## | 0.909 | 0.091 | 0.616 |
## | 0.844 | 0.167 | |
## | 0.560 | 0.056 | |
## --------------------|-----------|-----------|-----------|
## 1 | 74 | 200 | 274 |
## | 0.270 | 0.730 | 0.384 |
## | 0.156 | 0.833 | |
## | 0.104 | 0.280 | |
## --------------------|-----------|-----------|-----------|
## Column Total | 474 | 240 | 714 |
## | 0.664 | 0.336 | |
## --------------------|-----------|-----------|-----------|
##
##
confusionMatrix(train_val1$survived, pred_tr_rf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 400 40
## 1 74 200
##
## Accuracy : 0.8403
## 95% CI : (0.8114, 0.8665)
## No Information Rate : 0.6639
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6543
## Mcnemar's Test P-Value : 0.001997
##
## Sensitivity : 0.8439
## Specificity : 0.8333
## Pos Pred Value : 0.9091
## Neg Pred Value : 0.7299
## Prevalence : 0.6639
## Detection Rate : 0.5602
## Detection Prevalence : 0.6162
## Balanced Accuracy : 0.8386
##
## 'Positive' Class : 0
##
train_val1=train_val[,-c(4, 8, 9)]
test_val1=test_val[,-c(4, 8, 9)]
set.seed(1234)
rf_3 <- randomForest(x = train_val1[,-1], y=train_val1[,1], importance = TRUE, ntree = 500)
rf_3
##
## Call:
## randomForest(x = train_val1[, -1], y = train_val1[, 1], ntree = 500, importance = TRUE)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 2
##
## OOB estimate of error rate: 16.53%
## Confusion matrix:
## 0 1 class.error
## 0 394 46 0.1045455
## 1 72 202 0.2627737
varImpPlot(rf_3)
# training accuracy
pred_tr_rf <- predict(rf_3, type = "class") # predict class labels
CrossTable(x = train_val1$survived, y = pred_tr_rf, prop.chisq = FALSE)
##
##
## Cell Contents
## |-------------------------|
## | N |
## | N / Row Total |
## | N / Col Total |
## | N / Table Total |
## |-------------------------|
##
##
## Total Observations in Table: 714
##
##
## | pred_tr_rf
## train_val1$survived | 0 | 1 | Row Total |
## --------------------|-----------|-----------|-----------|
## 0 | 394 | 46 | 440 |
## | 0.895 | 0.105 | 0.616 |
## | 0.845 | 0.185 | |
## | 0.552 | 0.064 | |
## --------------------|-----------|-----------|-----------|
## 1 | 72 | 202 | 274 |
## | 0.263 | 0.737 | 0.384 |
## | 0.155 | 0.815 | |
## | 0.101 | 0.283 | |
## --------------------|-----------|-----------|-----------|
## Column Total | 466 | 248 | 714 |
## | 0.653 | 0.347 | |
## --------------------|-----------|-----------|-----------|
##
##
confusionMatrix(train_val1$survived, pred_tr_rf)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 394 46
## 1 72 202
##
## Accuracy : 0.8347
## 95% CI : (0.8054, 0.8612)
## No Information Rate : 0.6527
## P-Value [Acc > NIR] : < 2e-16
##
## Kappa : 0.6442
## Mcnemar's Test P-Value : 0.02137
##
## Sensitivity : 0.8455
## Specificity : 0.8145
## Pos Pred Value : 0.8955
## Neg Pred Value : 0.7372
## Prevalence : 0.6527
## Detection Rate : 0.5518
## Detection Prevalence : 0.6162
## Balanced Accuracy : 0.8300
##
## 'Positive' Class : 0
##
###Removing redundant variables barely changes the accuracy (0.8403 for rf_2, 0.8347 for rf_3)
##Even though random forest is powerful, we accept the model only after cross-validation
set.seed(1234)
str(train_val1)
## 'data.frame': 714 obs. of 9 variables:
## $ survived : Factor w/ 2 levels "0","1": 1 2 2 1 1 2 2 1 1 1 ...
## $ pclass : Factor w/ 3 levels "1","2","3": 3 1 1 3 3 2 3 3 3 3 ...
## $ sex : Factor w/ 2 levels "0","1": 1 2 2 1 1 2 2 1 1 2 ...
## $ cabin : Factor w/ 8 levels "0","0.4","0.8",..: 6 3 3 6 6 6 7 6 6 6 ...
## $ embarked : Factor w/ 3 levels "0","1","2": 3 1 3 2 3 1 3 3 3 3 ...
## $ title : Factor w/ 5 levels "0","1","2","3",..: 3 4 4 3 1 4 2 3 3 2 ...
## $ family_sized : Factor w/ 3 levels "0","1","2": 2 2 2 1 3 2 2 1 3 1 ...
## $ ticket_unique: Factor w/ 9 levels "1","2","3","4",..: 1 2 2 1 5 2 3 1 7 1 ...
## $ ticket_size : Factor w/ 3 levels "0","1","2": 1 2 2 1 3 2 2 1 3 1 ...
train_val1=train_val[,-c(4, 8, 9)]
test_val1=test_val[,-c(4, 8, 9)]
cv10_1 <- createMultiFolds(train_val1[,1], k = 10, times = 10)
# Set up caret's trainControl object per above.
ctrl_1 <- trainControl(method = "repeatedcv", number = 10, repeats = 10,
index = cv10_1)
set.seed(1234)
rf_5<- train(x = train_val1[,-1], y = train_val1[,1], method = "rf", tuneLength = 3,
ntree = 1000, trControl = ctrl_1)
rf_5
## Random Forest
##
## 714 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 10 times)
## Summary of sample sizes: 643, 643, 643, 642, 643, 642, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.8391510 0.6544539
## 5 0.8343192 0.6430520
## 8 0.8265884 0.6274052
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 2.
#varImpPlot(rf_5)
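To see how the tuned forest behaves on the 20% hold-out (not shown in the original), the hold-out factor levels may first need to be aligned with the training levels; a hedged sketch:
test_val1 <- as.data.frame(test_val1)
for (cn in names(test_val1)) { # re-level hold-out factors to match the training data
  test_val1[[cn]] <- factor(test_val1[[cn]], levels = levels(train_val1[[cn]]))
}
pred_rf5 <- predict(rf_5, newdata = test_val1)
confusionMatrix(pred_rf5, test_val1$survived, positive = "1")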
contrasts(train_val1$sex)
## 1
## 0 0
## 1 1
contrasts(train_val1$pclass)
## 2 3
## 1 0 0
## 2 1 0
## 3 0 1
##The above shows how the factor variables are dummy-coded
##Let's run a logistic regression model
log.mod <- glm(survived ~ ., family = binomial(link=logit),
data = train_val1)
###Check the summary
summary(log.mod)
##
## Call:
## glm(formula = survived ~ ., family = binomial(link = logit),
## data = train_val1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8764 -0.5266 -0.3496 0.5166 2.4618
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.47033 0.95863 3.620 0.000295 ***
## pclass2 -0.04453 1.14880 -0.039 0.969083
## pclass3 -0.99944 1.16479 -0.858 0.390868
## sex1 17.18762 1681.75344 0.010 0.991846
## cabin0.4 0.35673 0.86484 0.412 0.679986
## cabin0.8 -0.25195 0.76154 -0.331 0.740760
## cabin1.2 0.31247 0.92611 0.337 0.735814
## cabin1.6 1.09430 0.89835 1.218 0.223180
## cabin2 -0.96694 1.35794 -0.712 0.476424
## cabin2.4 -1.90552 1.73908 -1.096 0.273207
## cabin2.8 -15.76804 2399.54482 -0.007 0.994757
## embarked1 -0.13457 0.47598 -0.283 0.777395
## embarked2 -0.55939 0.31307 -1.787 0.073977 .
## title1 -17.54588 1681.75355 -0.010 0.991676
## title2 -3.70897 0.62304 -5.953 2.63e-09 ***
## title3 -17.14748 1681.75357 -0.010 0.991865
## title4 -4.14024 0.91839 -4.508 6.54e-06 ***
## family_sized1 -0.21628 0.38813 -0.557 0.577364
## family_sized2 -2.50889 0.74586 -3.364 0.000769 ***
## ticket_unique2 0.11997 0.41144 0.292 0.770602
## ticket_unique3 0.61875 0.49083 1.261 0.207445
## ticket_unique4 0.74948 0.60959 1.229 0.218890
## ticket_unique5 -0.77590 0.87926 -0.882 0.377535
## ticket_unique6 -1.83054 0.96693 -1.893 0.058338 .
## ticket_unique7 0.34595 0.84263 0.411 0.681392
## ticket_unique8 2.15651 0.70993 3.038 0.002384 **
## ticket_unique11 -13.76404 725.80974 -0.019 0.984870
## ticket_size1 NA NA NA NA
## ticket_size2 NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 950.86 on 713 degrees of freedom
## Residual deviance: 535.14 on 687 degrees of freedom
## AIC: 589.14
##
## Number of Fisher Scoring iterations: 15
confint(log.mod)
## 2.5 % 97.5 %
## (Intercept) 1.5716111 5.37206152
## pclass2 -1.3810259 2.32578391
## pclass3 -3.2526216 1.39595272
## sex1 -244.9150370 NA
## cabin0.4 -1.3084211 2.13293615
## cabin0.8 -1.1662756 1.33792737
## cabin1.2 -0.8396348 0.07707872
## cabin1.6 -0.6297922 2.94335286
## cabin2 -3.6941865 1.69563454
## cabin2.4 -5.3827621 1.52462384
## cabin2.8 NA 472.23124091
## embarked1 -1.0745859 0.79480653
## embarked2 -1.1718764 0.05746282
## title1 NA 243.49361589
## title2 -5.0110498 -2.54791191
## title3 NA 243.67365086
## title4 -6.0837947 -2.43860495
## family_sized1 -0.9853201 0.53884235
## family_sized2 -4.0198938 -1.07384868
## ticket_unique2 -0.6994021 0.91638285
## ticket_unique3 -0.3436438 1.58327221
## ticket_unique4 -0.4383857 1.95292694
## ticket_unique5 -2.5211296 0.93027841
## ticket_unique6 -3.8024762 0.05323992
## ticket_unique7 -1.4213413 1.92449458
## ticket_unique8 0.6864282 3.53030870
## ticket_unique11 NA 48.32803683
## ticket_size1 NA NA
## ticket_size2 NA NA
###Predict on the training data
train.probs <- predict(log.mod, newdata=train_val1, type = "response")
table(train_val1$survived, train.probs>0.5)
##
## FALSE TRUE
## 0 386 54
## 1 59 215
(386+215)/(386+215+54+59)
## [1] 0.8417367
###Logistic regression predicted the training data with an accuracy of about 0.84
test.probs <- predict(log.mod, newdata=test_val1,type = "response")
table(test_val1$survived,test.probs>0.5)
##
## FALSE TRUE
## 0 87 22
## 1 18 50
(87+50)/(87+50+22+18)
## [1] 0.7740113
###The accuracy on the held-out test data is about 0.77
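Since pROC is already loaded, the hold-out probabilities can also be summarised with an ROC curve and AUC; a sketch, not part of the original output:
roc_obj <- roc(response = test_val1$survived, predictor = test.probs)
auc(roc_obj) # area under the ROC curve
plot(roc_obj, main = "Logistic regression - validation ROC")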