1. EDA

1-1 Sex vs Survived

# Passenger counts per Sex in `train`, with one dodged bar per value of
# Survived.
ggplot(train, aes(x = Sex, fill = factor(Survived))) +
  geom_bar(position = "dodge") +
  scale_fill_discrete(name = "Survived") +
  theme_few()

1-2 Age vs Survived

# Histogram of Age in `train` (default binning), filled by the value of
# Survived.
ggplot(train, aes(x = Age, fill = factor(Survived))) +
  geom_histogram() +
  scale_fill_discrete(name = "Survived") +
  theme_few()

1-4 Pclass vs Survived (by Sex)

# Passenger counts per Pclass in `train`, filled by the value of Survived,
# drawn as one panel per Sex.
ggplot(train, aes(x = Pclass, fill = factor(Survived))) +
  geom_bar() +
  facet_wrap(~Sex) +
  scale_fill_discrete(name = "Survived") +
  theme_few()

2. Feature Engineering

2-1 New Variable: Title

# Derive Title from Name by deleting the text up to a ", " and the text from
# a "." onwards.
full$Title <- gsub('(.*, )|(\\..*)', '', full$Name)

# Infrequent titles that are pooled into two broader groups below
officer <- c('Capt', 'Col', 'Don', 'Dr', 'Major', 'Rev')
royalty <- c('Dona', 'Lady', 'the Countess','Sir', 'Jonkheer')

# Map Mlle/Ms to Miss and Mme to Mrs, then pool the infrequent titles
full$Title[full$Title %in% c('Mlle', 'Ms')] <- 'Miss'
full$Title[full$Title %in% 'Mme']           <- 'Mrs'
full$Title[full$Title %in% royalty]         <- 'Royalty'
full$Title[full$Title %in% officer]         <- 'Officer'

# Counts per Title for rows 1-891 of `full`, filled by the value of Survived.
ggplot(full[1:891, ], aes(x = factor(Title), fill = factor(Survived))) +
  geom_bar() +
  labs(x = "Title") +
  theme_few()

2-2 New Variable: Fsize (Family Size)

# Family size = SibSp + Parch + 1 (the +1 presumably counts the passenger)
full$Fsize <- full$SibSp + full$Parch + 1

# Discretise family size: exactly 1 -> 'Alone', between 1 and 5 -> 'Small',
# above 4 -> 'Big'; anything else is left as NA.
full$FsizeD <- ifelse(
  full$Fsize == 1, 'Alone',
  ifelse(full$Fsize > 1 & full$Fsize < 5, 'Small',
         ifelse(full$Fsize > 4, 'Big', NA_character_))
)

# Counts per family size for rows 1-891 of `full`, with dodged bars per value
# of Survived and an axis tick at every integer from 1 to 11.
ggplot(data = full[1:891, ],
       mapping = aes(fill = factor(Survived), x = Fsize)) +
  geom_bar(position = 'dodge') +
  labs(x = 'Family Size') +
  scale_x_continuous(breaks = 1:11) +
  theme_few()

2-3 New Variable: Child (From Age)

# Label passengers with Age below 18 as 'Child' and all other known ages as
# 'Adult', stored as a factor. A missing Age yields a missing Child.
full$Child <- as.factor(ifelse(full$Age < 18, 'Child', 'Adult'))

## The NAs in the variable "Age" need to be imputed first.

# Counts per Child level for rows 1-891 of `full`, with dodged bars per value
# of Survived.
ggplot(full[1:891, ], aes(x = Child, fill = factor(Survived))) +
  geom_bar(position = "dodge") +
  labs(x = "", y = "", title = "Child/Adult vs Survived") +
  scale_fill_discrete(name = "Survived") +
  theme_few()

3. Missingness

3-1 Processing Embarked

# Rows whose Embarked value is blank (or NA). The rows are looked up rather
# than hard-coded so the imputation below stays correct if the row order of
# `full` ever changes.
missing_embarked <- which(is.na(full$Embarked) | full$Embarked == "")
missing_embarked

## [1]  62 830

# Embarked counts per Pclass panel for rows 1-891 of `full`, filled by the
# value of Survived.
ggplot(aes(x=Embarked, fill=factor(Survived)),data=full[1:891,]) +
  geom_bar(stat="count") +
  facet_wrap(~Pclass) +
  scale_fill_discrete(name="Survived")

# Author's note: in all three classes the median record corresponds to
# boarding at port S.
#tapply(full$Embarked, full$Pclass,median, na.rm=TRUE)
#full[c(62, 830), 'Embarked']

# Fill the missing ports with 'S'
full$Embarked[missing_embarked] <- 'S'

3-2 Processing Fare

# Locate the rows with a missing Fare and print them. Indexing with the
# lookup result avoids relying on a hard-coded row number.
missing_fare <- which(is.na(full$Fare))
missing_fare

## [1] 1044

full[missing_fare, ]

##      PassengerId Survived Pclass               Name  Sex  Age SibSp Parch
## 1044        1044       NA      3 Storey, Mr. Thomas male 60.5     0     0
##      Ticket Fare Cabin Embarked Title Fsize FsizeD Child
## 1044   3701   NA              S    Mr     1  Alone Adult

# Density of Fare among rows with Pclass 3, with a dashed vertical line at
# the median Fare of those rows.
ggplot(full[full$Pclass == '3', ],
       aes(x = Fare)) +
  geom_density(fill = 'lightgrey', alpha = 0.4) +
  # na.rm is spelled TRUE: `T` is an ordinary variable that can be reassigned
  geom_vline(aes(xintercept = median(Fare, na.rm = TRUE)),
             colour = 'darkred', linetype = 'dashed', lwd = 1) +
  labs(x = 'Fare', y = "Density", title = "Pclass = 3") +
  theme_few()

# Impute a missing Fare with the median Fare of passengers in Pclass 3 who
# embarked at S. The target rows are found with is.na() instead of a
# hard-coded row number, and only rows from that same group are filled.
third_class_s <- full$Pclass == '3' & full$Embarked == 'S'
full$Fare[is.na(full$Fare) & third_class_s] <-
  median(full$Fare[third_class_s], na.rm = TRUE)

# Median Fare per Pclass after imputation
tapply(full$Fare, full$Pclass, median, na.rm = TRUE)

##       1       2       3 
## 60.0000 15.0458  8.0500

3-3 Age

# Number of rows with a missing Age
sum(is.na(full$Age))

## [1] 263

# Convert the categorical columns to factors
factor_vars <- c('Pclass', 'Sex', 'Embarked', 'Title', 'FsizeD')

full[factor_vars] <- lapply(full[factor_vars], as.factor)

# Fix the RNG seed before running the imputation
set.seed(123)

# Run mice with method 'rf' on every column of `full` except the
# identifier-like columns and Survived.
mice_mod <- mice(
  full[, setdiff(names(full),
                 c('PassengerId', 'Name', 'Ticket', 'Cabin', 'Survived'))],
  method = 'rf'
)

## 
##  iter imp variable
##   1   1  Age  Child
##   1   2  Age  Child
##   1   3  Age  Child
##   1   4  Age  Child
##   1   5  Age  Child
##   2   1  Age  Child
##   2   2  Age  Child
##   2   3  Age  Child
##   2   4  Age  Child
##   2   5  Age  Child
##   3   1  Age  Child
##   3   2  Age  Child
##   3   3  Age  Child
##   3   4  Age  Child
##   3   5  Age  Child
##   4   1  Age  Child
##   4   2  Age  Child
##   4   3  Age  Child
##   4   4  Age  Child
##   4   5  Age  Child
##   5   1  Age  Child
##   5   2  Age  Child
##   5   3  Age  Child
##   5   4  Age  Child
##   5   5  Age  Child

# Take the completed data set from mice and copy its Age column into `full`
mice_output <- complete(mice_mod)
full$Age <- mice_output$Age

# Rebuild Child from the updated Age: below 18 -> 'Child', otherwise 'Adult'
full$Child <- as.factor(ifelse(full$Age < 18, 'Child', 'Adult'))

4. Prediction

4-1 Building Model

# Split `full` by row position: rows 1-891 -> train, rows 892-1309 -> test
train <- full[seq_len(891), ]
test  <- full[seq(892, 1309), ]

# Fit a random forest for Survived (as a factor) on the listed predictors
rf_model <- randomForest(
  factor(Survived) ~ Pclass + Sex + Age + Fare + Embarked + Title +
    FsizeD + Child,
  data = train
)

# Importance matrix of the fitted forest
importance <- importance(rf_model)

# One row per variable with its MeanDecreaseGini rounded to two decimals
varImportance <- data.frame(
  Variables  = rownames(importance),
  Importance = round(importance[, 'MeanDecreaseGini'], 2)
)

# Add a rank label ('#1' = largest Importance) for each variable
rankImportance <- mutate(
  varImportance,
  Rank = paste0('#', dense_rank(desc(Importance)))
)

# Horizontal bar chart of variable importance, ordered by Importance, with
# each variable's rank label printed in red near the start of its bar.
ggplot(
  rankImportance,
  aes(x = reorder(Variables, Importance), y = Importance, fill = Importance)
) +
  geom_bar(stat = 'identity') +
  geom_text(
    aes(x = Variables, y = 0.5, label = Rank),
    hjust = 0, vjust = 0.55, size = 4, colour = 'red'
  ) +
  labs(x = 'Variables') +
  coord_flip() +
  theme_few()

4-2 Predict!

# Predict on `train` and cross-tabulate the predictions against the recorded
# Survived values.
pred <- predict(rf_model, train)
result <- table(pred, train$Survived)
result

##     
## pred   0   1
##    0 526  75
##    1  23 267

# Accuracy on `train`: matching cells (the diagonal) over all cells
sum(diag(result)) / sum(result)

## [1] 0.8900112

# Predict using the test set
prediction <- predict(rf_model, test)

# Save the solution to a data frame with two columns: PassengerId and
# Survived (prediction). The id column is named exactly like the source
# column, `PassengerId`, so the written header matches this description.
solution <- data.frame(PassengerId = test$PassengerId, Survived = prediction)

# Write the solution to file without row names (FALSE spelled out, because
# `F` is a reassignable variable)
write.csv(solution, file = 'Solution.csv', row.names = FALSE)

Titanic: Machine Learning from Disaster

王長宏 Paul Wang

2017/6/9

1. EDA

1-1 Sex vs Survived

1-2 Age vs Survived

1-4 Pclass vs Survived (by Sex)

2. Feature Engineering

2-1 New Variable: Title

2-2 New Variable: Fsize (Family Size)

2-3 New Variable: Child (From Age)

3. Missingness

3-1 Processing Embarked

3-2 Processing Fare

3-3 Age

4. Prediction

4-1 Building Model

4-2 Predict!