# Passenger counts by sex, with survivors and non-survivors side by side.
ggplot(train, aes(x = Sex)) +
  geom_bar(aes(fill = factor(Survived)), stat = "count", position = "dodge") +
  theme_few() +
  scale_fill_discrete(name = "Survived")

# Age histogram, filled by survival outcome.
ggplot(train, aes(x = Age)) +
  geom_histogram(aes(fill = factor(Survived))) +
  theme_few() +
  scale_fill_discrete(name = "Survived")

# Passenger counts by class, filled by survival outcome, one panel per sex.
ggplot(train, aes(x = Pclass, fill = factor(Survived))) +
  geom_bar(stat = "count") +
  theme_few() +
  scale_fill_discrete(name = "Survived") +
  facet_wrap(~Sex)
# Derive Title from Name: strip everything up to and including ", " and
# everything from a "." to the end of the string.
full$Title <- gsub('(.*, )|(\\..*)', '', full$Name)

# Less common titles that get collapsed into two grouped labels.
officer <- c('Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev')
royalty <- c('Dona', 'Lady', 'the Countess', 'Sir', 'Jonkheer')

# Fold Mlle/Ms into Miss and Mme into Mrs, then apply the grouped labels.
full$Title[full$Title %in% c('Mlle', 'Ms')] <- 'Miss'
full$Title[full$Title %in% 'Mme'] <- 'Mrs'
full$Title[full$Title %in% royalty] <- 'Royalty'
full$Title[full$Title %in% officer] <- 'Officer'

# Survival counts per title for rows 1-891 of `full`.
ggplot(full[1:891, ], aes(x = factor(Title), fill = factor(Survived))) +
  geom_bar(stat = "count") +
  labs(x = "Title") +
  theme_few()
# Family size, computed as SibSp + Parch + 1.
full$Fsize <- 1 + full$SibSp + full$Parch

# Discretised family size: 1 -> 'Alone', 2-4 -> 'Small', above 4 -> 'Big'.
# Any value matching none of the three conditions stays NA.
full$FsizeD <- ifelse(
  full$Fsize > 4, 'Big',
  ifelse(
    full$Fsize > 1 & full$Fsize < 5, 'Small',
    ifelse(full$Fsize == 1, 'Alone', NA_character_)
  )
)

# Survival counts per family size for rows 1-891 of `full`.
ggplot(full[1:891, ], aes(x = Fsize, fill = factor(Survived))) +
  geom_bar(stat = 'count', position = 'dodge') +
  scale_x_continuous(breaks = c(1:11)) +
  labs(x = 'Family Size') +
  theme_few()
# Provisional Child/Adult flag with the cut-off at age 18. Rows whose Age is
# NA get an NA flag here.
full$Child <- as.factor(ifelse(full$Age < 18, 'Child', 'Adult'))
## The NA values of variable "Age" need to be imputed first.

# Survival counts for children vs adults for rows 1-891 of `full`.
ggplot(full[1:891, ], aes(x = Child, fill = factor(Survived))) +
  geom_bar(stat = "count", position = "dodge") +
  scale_fill_discrete(name = "Survived") +
  labs(x = "", y = "", title = "Child/Adult vs Survived") +
  theme_few()
# Find the rows whose port of embarkation is blank. The indices are kept in a
# variable so the fill below targets exactly the rows found here instead of
# relying on hard-coded row numbers.
missing_embarked <- which(full$Embarked == "")
missing_embarked
## [1] 62 830

# Survival counts per embarkation port for rows 1-891, one panel per class.
ggplot(full[1:891, ], aes(x = Embarked, fill = factor(Survived))) +
  geom_bar(stat = "count") +
  facet_wrap(~Pclass) +
  scale_fill_discrete(name = "Survived")

# For all three classes the median value corresponds to embarking at port S,
# so the blank entries are filled with 'S'.
#tapply(full$Embarked, full$Pclass,median, na.rm=TRUE)
#full[missing_embarked, 'Embarked']
full$Embarked[missing_embarked] <- 'S'
# Find the rows with a missing fare. The indices are kept in a variable so the
# inspection and the fill below follow the data instead of a hard-coded row.
missing_fare <- which(is.na(full$Fare))
missing_fare
## [1] 1044
full[missing_fare, ]
## PassengerId Survived Pclass Name Sex Age SibSp Parch
## 1044 1044 NA 3 Storey, Mr. Thomas male 60.5 0 0
## Ticket Fare Cabin Embarked Title Fsize FsizeD Child
## 1044 3701 NA S Mr 1 Alone Adult

# Fare density for Pclass 3 with the median fare marked by a dashed line.
ggplot(full[full$Pclass == '3', ], aes(x = Fare)) +
  geom_density(fill = 'lightgrey', alpha = 0.4) +
  geom_vline(aes(xintercept = median(Fare, na.rm = TRUE)),
             colour = 'darkred', linetype = 'dashed', lwd = 1) +
  xlab('Fare') +
  ggtitle("Pclass = 3") +
  ylab("Density") +
  theme_few()

# Fill each missing fare with the median fare of passengers sharing that
# row's Pclass and Embarked. For the row shown above (Pclass 3, Embarked S)
# this is the Pclass 3 / port S median. All medians are computed before any
# value is written back, so one fill cannot influence another.
full$Fare[missing_fare] <- vapply(missing_fare, function(i) {
  peers <- full$Pclass == full$Pclass[i] & full$Embarked == full$Embarked[i]
  median(full$Fare[peers], na.rm = TRUE)
}, numeric(1))

# Median fare per class after the fill.
tapply(full$Fare, full$Pclass, median, na.rm = TRUE)
## 1 2 3
## 60.0000 15.0458 8.0500
# Number of passengers whose Age is missing.
sum(is.na(full$Age))
## [1] 263

# Convert the categorical columns to factors.
factor_vars <- c('Pclass', 'Sex', 'Embarked', 'Title', 'FsizeD')
full[factor_vars] <- lapply(full[factor_vars], as.factor)

# Fix the RNG seed, then run mice with the 'rf' method on every column except
# the identifier-like ones and the outcome.
set.seed(123)
mice_mod <- mice(
  full[, setdiff(
    names(full),
    c('PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived')
  )],
  method = 'rf'
)
##
## iter imp variable
## 1 1 Age Child
## 1 2 Age Child
## 1 3 Age Child
## 1 4 Age Child
## 1 5 Age Child
## 2 1 Age Child
## 2 2 Age Child
## 2 3 Age Child
## 2 4 Age Child
## 2 5 Age Child
## 3 1 Age Child
## 3 2 Age Child
## 3 3 Age Child
## 3 4 Age Child
## 3 5 Age Child
## 4 1 Age Child
## 4 2 Age Child
## 4 3 Age Child
## 4 4 Age Child
## 4 5 Age Child
## 5 1 Age Child
## 5 2 Age Child
## 5 3 Age Child
## 5 4 Age Child
## 5 5 Age Child
# Extract a completed data set from the mice result and copy its Age column
# back into `full`.
mice_output <- complete(mice_mod)
full$Age <- mice_output$Age

# Recompute the Child/Adult flag (cut-off at age 18) from the new Age values.
full$Child <- as.factor(ifelse(full$Age < 18, 'Child', 'Adult'))
# Split `full` back into its two parts by row position.
train <- full[1:891, ]
test <- full[892:1309, ]

# Random forest classifier for Survived, fitted on the training rows.
rf_model <- randomForest(
  factor(Survived) ~ Pclass + Sex + Age + Fare + Embarked + Title + FsizeD +
    Child,
  data = train
)

# Collect each variable's MeanDecreaseGini, rounded to two decimals.
importance <- importance(rf_model)
varImportance <- data.frame(
  Variables = rownames(importance),
  Importance = round(importance[, 'MeanDecreaseGini'], 2)
)

# Add a rank label ('#1' for the highest importance).
rankImportance <- mutate(
  varImportance,
  Rank = paste0('#', dense_rank(desc(Importance)))
)

# Horizontal bar chart of importance, ordered by importance, with the rank
# label drawn in red on each bar.
ggplot(
  rankImportance,
  aes(x = reorder(Variables, Importance), y = Importance, fill = Importance)
) +
  geom_bar(stat = 'identity') +
  geom_text(
    aes(x = Variables, y = 0.5, label = Rank),
    hjust = 0, vjust = 0.55, size = 4, colour = 'red'
  ) +
  labs(x = 'Variables') +
  coord_flip() +
  theme_few()
# Predict the same rows the model was fitted on and cross-tabulate the
# predictions (rows) against the recorded outcomes (columns).
pred <- predict(rf_model, newdata = train)
result <- table(pred, train$Survived)
result
##
## pred 0 1
## 0 526 75
## 1 23 267
# Accuracy on the training rows: diagonal (matching) cells over all cells.
sum(diag(result)) / sum(result)
## [1] 0.8900112
# Predict survival for the test rows.
prediction <- predict(rf_model, test)
# Collect the result in a two-column data frame: passenger id and prediction.
# NOTE(review): the id column is written as `PassengerID`, while the source
# column is spelled `PassengerId` - confirm which header the consumer expects.
solution <- data.frame(PassengerID = test$PassengerId, Survived = prediction)
# Write the solution to file without row names. FALSE is spelled out because
# `F` is an ordinary variable that can be reassigned.
write.csv(solution, file = 'Solution.csv', row.names = FALSE)