We will explore the Titanic dataset and build a model that predicts whether or not a passenger survived.
library(dplyr)
library(ggplot2)
library(rpart)
library(rpart.plot)
library(caret)
library(randomForest)
train <- read.csv("C:\\Users\\Vadim_Katsemba\\Downloads\\Titanictrain.csv")
test <- read.csv("C:\\Users\\Vadim_Katsemba\\Downloads\\Titanictest.csv")
# The test set has no Survived column, so add a placeholder before combining
test$Survived <- NA
all_data <- rbind(train, test)
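Age is missing for a number of passengers (hence the warnings under the histograms below), so it is worth checking NA counts in the combined data before plotting. A quick sketch:
# Count missing values per column (Survived is NA for the test rows by construction)
sapply(all_data, function(x) sum(is.na(x)))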
ggplot(all_data[1:891,], aes(Age, fill = factor(Survived))) + geom_histogram(bins = 30) + xlab("Age") + scale_fill_discrete(name = "Survived") + ggtitle("Survivors by Age")
## Warning: Removed 177 rows containing non-finite values (stat_bin).
ggplot(all_data[1:891,], aes(Sex, fill = factor(Survived))) + geom_bar(stat = "count", position = "dodge") + xlab("Sex") + scale_fill_discrete(name = "Survived") + ggtitle("Survivors by Sex")
ggplot(all_data[1:891,], aes(Age, fill = factor(Survived))) + geom_histogram(bins = 30) + xlab("Age") + facet_grid(.~Sex) + scale_fill_discrete(name = "Survived") + ggtitle("Survivors by Age and Sex")
## Warning: Removed 177 rows containing non-finite values (stat_bin).
ggplot(all_data[1:891,], aes(Pclass, fill = factor(Survived))) + geom_bar(stat = "count") + xlab("Pclass") + facet_grid(.~Sex) + scale_fill_discrete(name = "Survived") + ggtitle("Survivors by Passenger Class and Sex")
ggplot(all_data[1:891,], aes(x = Age, y = Sex)) +
  geom_jitter(aes(colour = factor(Survived))) +
  facet_wrap(~Pclass) +
  labs(x = "Age", y = "Sex", title = "Survivors by Age, Sex and Passenger Class") +
  scale_colour_discrete(name = "Survived") +
  scale_x_continuous(name = "Age", limits = c(0, 81))
## Warning: Removed 177 rows containing missing values (geom_point).
# Extract each passenger's title: the text between the comma and the period in Name
all_data$Title <- gsub('(.*, )|(\\..*)', '', all_data$Name)
table(all_data$Sex, all_data$Title)
##
## Capt Col Don Dona Dr Jonkheer Lady Major Master Miss Mlle Mme
## female 0 0 0 1 1 0 1 0 0 260 2 1
## male 1 4 1 0 7 1 0 2 61 0 0 0
##
## Mr Mrs Ms Rev Sir the Countess
## female 0 197 2 0 0 1
## male 757 0 0 8 1 0
# Group rare titles and map French/alternate forms to their standard equivalents
officer <- c('Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev')
royalty <- c('Dona', 'Lady', 'the Countess', 'Sir', 'Jonkheer')
all_data$Title[all_data$Title == 'Mlle'] <- 'Miss'
all_data$Title[all_data$Title == 'Ms'] <- 'Miss'
all_data$Title[all_data$Title == 'Mme'] <- 'Mrs'
all_data$Title[all_data$Title %in% royalty] <- 'Royalty'
all_data$Title[all_data$Title %in% officer] <- 'Officer'
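To confirm that the rare titles were folded into the new categories, the sex-by-title cross-tabulation can be re-run:
table(all_data$Sex, all_data$Title)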
ggplot(all_data[1:891,], aes(Title, fill = factor(Survived))) +
  geom_bar(stat = "count") +
  xlab("Title") +
  scale_fill_discrete(name = "Survived") +
  ggtitle("Survivors by Title")
# Family size = siblings/spouses aboard + parents/children aboard + the passenger
all_data$FamilySize <- all_data$SibSp + all_data$Parch + 1
ggplot(all_data[1:891,], aes(x = FamilySize, fill = factor(Survived))) +
  geom_bar(stat = "count", position = "dodge") +
  scale_x_continuous(breaks = c(1:11)) +
  xlab("Family Size") +
  scale_fill_discrete(name = "Survived") +
  ggtitle("Survivors by Family Size")
# Convert the categorical predictors to factors for modelling
all_data$Sex <- factor(all_data$Sex)
all_data$Title <- factor(all_data$Title)
all_data$Pclass <- factor(all_data$Pclass)
all_data$FamilySize <- factor(all_data$FamilySize)
all_data$Embarked <- factor(all_data$Embarked)
# Keep the engineered features for the 891 training rows and attach the response
features <- all_data[1:891, c("Pclass", "Title", "Sex", "Embarked", "FamilySize")]
response <- as.factor(train$Survived)
features$Survived <- response
# Hold out 20% of the training data for model evaluation
set.seed(1234)
inTrain <- createDataPartition(features$Survived, times = 1, p = 0.8, list = FALSE)
train_eval <- features[inTrain,]
test_eval <- features[-inTrain,]
# Check that the class balance is preserved across the full set and both splits
round(prop.table(table(train$Survived)), digits = 2)
##
## 0 1
## 0.62 0.38
round(prop.table(table(train_eval$Survived)), digits = 2)
##
## 0 1
## 0.62 0.38
round(prop.table(table(test_eval$Survived)), digits = 2)
##
## 0 1
## 0.62 0.38
# Fit a decision tree (rpart) via caret on the 80% split
set.seed(1234)
DT_model <- train(Survived ~ ., data = train_eval, method = "rpart")
library(rattle)
fancyRpartPlot(DT_model$finalModel)
# Score the tree on the data it was trained on (training-set performance)
DT_Pred <- predict(DT_model, newdata = train_eval, type = "raw")
confusionMatrix(DT_Pred, train_eval$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 423 114
## 1 17 160
##
## Accuracy : 0.8165
## 95% CI : (0.7861, 0.8443)
## No Information Rate : 0.6162
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5843
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.9614
## Specificity : 0.5839
## Pos Pred Value : 0.7877
## Neg Pred Value : 0.9040
## Prevalence : 0.6162
## Detection Rate : 0.5924
## Detection Prevalence : 0.7521
## Balanced Accuracy : 0.7727
##
## 'Positive' Class : 0
##
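The confusion matrix above scores the tree on the same 80% split it was trained on. For a held-out estimate comparable to the random forest below, the tree can also be scored on test_eval; a minimal sketch (DT_test_pred is just an illustrative name):
DT_test_pred <- predict(DT_model, newdata = test_eval, type = "raw")
confusionMatrix(DT_test_pred, test_eval$Survived)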
# Fit a random forest on the same features (column 6 of train_eval is Survived)
set.seed(1234)
rf_model <- randomForest(x = train_eval[, -6], y = train_eval[, 6], importance = TRUE, ntree = 1000)
# Score the forest on the held-out 20% split
rf_predict <- predict(rf_model, newdata = test_eval)
confusionMatrix(rf_predict, test_eval$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 95 23
## 1 14 45
##
## Accuracy : 0.791
## 95% CI : (0.7236, 0.8483)
## No Information Rate : 0.6158
## P-Value [Acc > NIR] : 4.548e-07
##
## Kappa : 0.5469
##
## Mcnemar's Test P-Value : 0.1884
##
## Sensitivity : 0.8716
## Specificity : 0.6618
## Pos Pred Value : 0.8051
## Neg Pred Value : 0.7627
## Prevalence : 0.6158
## Detection Rate : 0.5367
## Detection Prevalence : 0.6667
## Balanced Accuracy : 0.7667
##
## 'Positive' Class : 0
##
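Since the forest was grown with importance = TRUE, the relative contribution of each feature can be inspected, and the fitted model can be applied to the unlabelled test passengers (rows 892 onward of all_data). The sketch below assumes the test file keeps Kaggle's PassengerId column; the object names and output file name are illustrative.
# Plot variable importance (available because importance = TRUE above)
varImpPlot(rf_model)
# Predict on the unlabelled test passengers using the same engineered features
holdout <- all_data[892:nrow(all_data), c("Pclass", "Title", "Sex", "Embarked", "FamilySize")]
holdout_pred <- predict(rf_model, newdata = holdout)
# Write a Kaggle-style submission file (file name is illustrative)
submission <- data.frame(PassengerId = test$PassengerId, Survived = holdout_pred)
write.csv(submission, "titanic_submission.csv", row.names = FALSE)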