I will try to predict which passengers survived the Titanic disaster using the ctree (conditional inference tree) algorithm. First, load all the packages we will use during the analysis.
library(dplyr)
library(ggplot2)
library(tidyr)
library(stringr)
library(caret)
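One note on dependencies: caret's method = 'ctree' (used later) relies on the party package, which caret loads when the model is trained. As a precaution it can be installed up front (an environment check, not part of the original setup):
# Install party if it is not already available; caret's ctree method needs it
if (!requireNamespace("party", quietly = TRUE)) install.packages("party")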
# Load Data
train <- read.csv("train.csv", stringsAsFactors = F)
test <- read.csv("test.csv", stringsAsFactors = F)
complete <- bind_rows(train, test)
tibble(complete)
## # A tibble: 1,309 x 1
## complete$Passen~ $Survived $Pclass $Name $Sex $Age $SibSp $Parch
## <int> <int> <int> <chr> <chr> <dbl> <int> <int>
## 1 1 0 3 Brau~ male 22 1 0
## 2 2 1 1 Cumi~ fema~ 38 1 0
## 3 3 1 3 Heik~ fema~ 26 0 0
## 4 4 1 1 Futr~ fema~ 35 1 0
## 5 5 0 3 Alle~ male 35 0 0
## 6 6 0 3 Mora~ male NA 0 0
## 7 7 0 1 McCa~ male 54 0 0
## 8 8 0 3 Pals~ male 2 3 1
## 9 9 1 3 John~ fema~ 27 0 2
## 10 10 1 2 Nass~ fema~ 14 1 0
## # ... with 1,299 more rows, and 4 more variables: $Ticket <chr>,
## # $Fare <dbl>, $Cabin <chr>, $Embarked <chr>
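Note that test.csv has no Survived column, so bind_rows fills it with NA for those rows; a quick check on the combined data:
# The 418 rows that came from the test set carry no Survived label
sum(is.na(complete$Survived))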
# Cleaning Data
complete <-
complete %>%
mutate(familySize = 1 + SibSp + Parch,
Title = gsub('(.*, )|(\\..*)', '', Name),
Cabin = gsub('[0-9].*', '', Cabin)) %>%
select(-SibSp, -Parch, -Ticket, -Name)
# Title Name
rare_title = c('Capt', 'Col', 'Don', 'Jonkheer',
'Lady', 'Major', 'Rev', 'Sir',
'the Countess', 'Dr', 'Mile',
'Ms', 'Dona', 'Mme', 'Mlle')
complete$Title[complete$Title %in% rare_title] <- 'Other'
# Cabin Name
cabin_name = c('E', 'F', 'F E', 'F G', 'T')
complete$Cabin[complete$Cabin %in% cabin_name] <- 'EFGT'
complete$Cabin[complete$Cabin==''] <- 'Blank'
Here I extracted the passenger's title from the Name variable, reduced Cabin to its deck letter, and calculated the family size.
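To see what those two regular expressions do, it helps to run them on sample values (an illustrative check; the name is the first passenger from the preview above and 'C85' is just a sample cabin string):
# Everything before ', ' and everything from the first '.' onward is stripped,
# leaving only the title
gsub('(.*, )|(\\..*)', '', 'Braund, Mr. Owen Harris')   # "Mr"
# The cabin regex drops the room number and keeps the deck letter
gsub('[0-9].*', '', 'C85')                              # "C"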
# Missing Value
complete$Age[is.na(complete$Age)] <- median(complete$Age, na.rm=TRUE)
complete$Fare[is.na(complete$Fare)] <- median(complete$Fare, na.rm=TRUE)
I handled the missing values in Age and Fare by filling them with the median of each variable.
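For reference, the per-column NA counts can be inspected (a small sketch; run it before the two imputation lines above to see what actually needs filling):
# Number of missing values in each column
colSums(is.na(complete))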
# Change Data Type
complete <-
complete %>%
mutate(Survived = as.factor(Survived),
Pclass = as.factor(Pclass),
Sex = as.factor(Sex),
Cabin = as.factor(Cabin),
Title = as.factor(Title),
Embarked = as.factor(Embarked))
Here I convert the character variables into factors.
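To confirm the conversion, the column classes can be checked (a quick look, nothing new assumed):
# Class of each column after the type conversion
sapply(complete, class)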
# Split Data
train <- complete[c(1:891),]
test <- complete[c(892:1309),]
summary(train)
## PassengerId Survived Pclass Sex Age
## Min. : 1.0 0:549 1:216 female:314 Min. : 0.42
## 1st Qu.:223.5 1:342 2:184 male :577 1st Qu.:22.00
## Median :446.0 3:491 Median :28.00
## Mean :446.0 Mean :29.36
## 3rd Qu.:668.5 3rd Qu.:35.00
## Max. :891.0 Max. :80.00
##
## Fare Cabin Embarked familySize Title
## Min. : 0.00 A : 15 : 2 Min. : 1.000 Master: 40
## 1st Qu.: 7.91 B : 47 C:168 1st Qu.: 1.000 Miss :182
## Median : 14.45 Blank:687 Q: 77 Median : 1.000 Mr :517
## Mean : 32.20 C : 59 S:644 Mean : 1.905 Mrs :125
## 3rd Qu.: 31.00 D : 33 3rd Qu.: 2.000 Other : 27
## Max. :512.33 EFGT : 46 Max. :11.000
## G : 4
I split the modified complete data back into train and test sets along the same row ranges as before.
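As an aside, the same split can be done without hard-coded row indices, since only the test rows have a missing Survived (an equivalent sketch):
# Labeled rows go to train, unlabeled rows go to test
train <- complete %>% filter(!is.na(Survived))
test <- complete %>% filter(is.na(Survived))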
# Pclass
ggplot(data = train) +
aes(x = Pclass, fill = Survived) +
geom_bar(position = "dodge") +
scale_fill_viridis_d(option = "viridis") +
theme_minimal() +
theme(legend.position = 'bottom')
# Sex
ggplot(data = train) +
aes(x = Sex, fill = Survived) +
geom_bar(position = "dodge") +
scale_fill_viridis_d(option = "viridis") +
theme_minimal() +
theme(legend.position = 'bottom')
# Cabin
ggplot(data = train) +
aes(x = Cabin, fill = Survived) +
geom_bar(position = "dodge") +
scale_fill_viridis_d(option = "viridis") +
theme_minimal() +
theme(legend.position = 'bottom')
# Family Size
ggplot(data = train) +
aes(x = familySize, fill = Survived) +
geom_histogram(bins = 30) +
scale_fill_viridis_d(option = "viridis") +
theme_minimal() +
theme(legend.position = 'bottom')
# Title
ggplot(data = train) +
aes(x = Title, fill = Survived) +
geom_bar(position = "dodge") +
scale_fill_viridis_d(option = "viridis") +
theme_minimal() +
theme(legend.position = 'bottom')
Most passengers in Pclass 3 did not survive the incident. Male passengers mostly did not survive, in contrast to female passengers. Passengers without a recorded cabin mostly did not survive. Most passengers have a family size below 3. Passengers with the title Mr. largely did not survive.
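These visual impressions can be backed up with survival rates per group (a small sketch on the training data; Survived == "1" means the passenger survived):
# Survival rate by passenger class
train %>% group_by(Pclass) %>% summarise(survival_rate = mean(Survived == "1"))
# Survival rate by sex
train %>% group_by(Sex) %>% summarise(survival_rate = mean(Survived == "1"))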
# Make Model for Predict
fitControl <- trainControl(method = "cv", number = 10,
savePredictions = TRUE)
# Ctree Model Train
ctree <- train(Survived ~ ., data = train[2:10],
method = 'ctree',
metric = 'Accuracy',
trControl = fitControl)
ctree
## Conditional Inference Tree
##
## 891 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 802, 803, 802, 802, 801, 802, ...
## Resampling results across tuning parameters:
##
## mincriterion Accuracy Kappa
## 0.01 0.8080099 0.5811429
## 0.50 0.8204202 0.6066400
## 0.99 0.8193088 0.6063194
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mincriterion = 0.5.
plot(varImp(ctree))
confusionMatrix(ctree)
## Cross-Validated (10 fold) Confusion Matrix
##
## (entries are percentual average cell counts across resamples)
##
## Reference
## Prediction 0 1
## 0 56.0 12.3
## 1 5.6 26.0
##
## Accuracy (average) : 0.8204
I built the model with the caret package, using 10-fold cross-validation as the fit control and the ctree method. The model achieves an average cross-validated accuracy of about 0.82.
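The fitted tree itself can also be inspected; caret stores the underlying party object in finalModel, which has its own plot method (a sketch, assuming the default party plotting):
# Draw the final conditional inference tree selected by cross-validation
plot(ctree$finalModel)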
# Predicted
ctree.pred <- predict(ctree, test)
# Submission
submission <- data.frame(PassengerId = c(892:1309),
Survived = ctree.pred)
glimpse(submission)
## Observations: 418
## Variables: 2
## $ PassengerId <int> 892, 893, 894, 895, 896, 897, 898, 899, 900, 901, ...
## $ Survived <fct> 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0,...
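As a quick sanity check before writing the submission file, the predicted class balance can be compared with the roughly 38% survival rate seen in the training data (a small sketch):
# Share of predicted non-survivors and survivors among the test passengers
prop.table(table(ctree.pred))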
#write.csv(submission, file = "ctree_submission.csv", row.names=FALSE)
Finally, I make predictions on the test data and submit them to Kaggle. I got an accuracy score of 0.79 on the leaderboard under the username Ahsan Amri Rohman.