Loading the Libraries

library(xgboost)
library(rpart)
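
If either library() call fails because a package isn't installed yet (for example on a fresh AWS instance), a one-time install fixes that:

# One-time install, only needed if the packages are missing
install.packages(c("xgboost", "rpart"))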

Getting and Cleaning the Data

# if you're using RStudio Cloud
train <- read.csv("train.csv")
test <- read.csv("test.csv")

# if you're using RStudio on AWS
train <- read.csv("/home/rstudio/data/train.csv")
test <- read.csv("/home/rstudio/data/test.csv")

# if you're working locally with a project in your Google Drive Data Science folder
train <- read.csv("..Data_Science_Data/titanic/train.csv")
test <- read.csv("..Data_Science_Data/titanic/test.csv")
feature_eng <- function(train_df, test_df) {
  # Combine the train and test sets so features are engineered consistently across both
  test_df$Survived <- NA
  combi <- rbind(train_df, test_df) 
  
  # Feature engineering
  combi$Name <- as.character(combi$Name)
  
  # Extract the title from each name, then group rare titles together to reduce noise in the data
  combi$Title <- sapply(combi$Name, FUN=function(x) {strsplit(x, split='[,.]')[[1]][2]})
  combi$Title <- sub(' ', '', combi$Title)
  #table(combi$Title)
  combi$Title[combi$Title %in% c('Mme', 'Mlle')] <- 'Mlle'
  combi$Title[combi$Title %in% c('Capt', 'Don', 'Major', 'Sir')] <- 'Sir'
  combi$Title[combi$Title %in% c('Dona', 'Lady', 'the Countess', 'Jonkheer')] <- 'Lady'
  combi$Title <- factor(combi$Title)
  
  # Reunite families: family size combined with surname gives a family ID
  combi$FamilySize <- combi$SibSp + combi$Parch + 1
  combi$Surname <- sapply(combi$Name, FUN=function(x) {strsplit(x, split='[,.]')[[1]][1]})
  combi$FamilyID <- paste(as.character(combi$FamilySize), combi$Surname, sep="")
  combi$FamilyID[combi$FamilySize <= 2] <- 'Small'
  #table(combi$FamilyID)
  combi$FamilyID <- factor(combi$FamilyID)
  
  
  # Fit a decision tree (rpart, anova method) to impute the missing Age values
  Agefit <- rpart(Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize,
                  data = combi[!is.na(combi$Age), ], method = "anova")
  combi$Age[is.na(combi$Age)] <- predict(Agefit, combi[is.na(combi$Age),])
  
  # Fill in the missing Embarked and Fare values
  # which(combi$Embarked == '')   # rows 62 and 830 have a blank Embarked; use the most common port
  combi$Embarked[c(62, 830)] <- "S"
  combi$Embarked <- factor(combi$Embarked)
  # which(is.na(combi$Fare))      # row 1044 has a missing Fare; use the median fare
  combi$Fare[1044] <- median(combi$Fare, na.rm = TRUE)
  
  # Create a FamilyID2 variable that reduces the number of factor levels in FamilyID
  # (a carry-over from the random-forest version of this analysis; it is dropped before xgboost below)
  combi$FamilyID2 <- combi$FamilyID
  combi$FamilyID2 <- as.character(combi$FamilyID2)
  combi$FamilyID2[combi$FamilySize <= 3] <- 'Small'
  combi$FamilyID2 <- factor(combi$FamilyID2)
  
  return(combi)
}

# Calling the engineering function
data <- feature_eng(train, test)
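
A quick sanity check on the engineered frame before dropping columns: it should have 1309 rows (891 train + 418 test), and the rpart imputation should have left no missing Age values.

# 1309 rows expected; 0 missing Age values expected after imputation
dim(data)
sum(is.na(data$Age))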

# Keep only the columns we will feed to xgboost
# (drop PassengerId, Name, Ticket, Cabin, Surname, and FamilyID2)
combi2 <- data[, -c(1, 4, 9, 11, 15, 17)]

xgboost-specific data cleaning

# Convert the factor columns to numerics starting at 0, since xgboost needs a purely numeric matrix.
# Sex is wrapped in factor() so this also works on R >= 4.0, where read.csv() no longer
# converts strings to factors by default.
combi2$Pclass <- as.numeric(combi2$Pclass) - 1
combi2$Sex <- as.numeric(factor(combi2$Sex)) - 1
combi2$Embarked <- as.numeric(combi2$Embarked) - 1
combi2$Title <- as.numeric(combi2$Title) - 1
combi2$FamilySize <- as.numeric(combi2$FamilySize) - 1
combi2$FamilyID <- as.numeric(combi2$FamilyID) - 1

# Convert the data frame into a numeric matrix for xgboost
combi2 <- as.matrix(combi2)

Splitting back to train and test sets

train <- combi2[1:891, ]      # rows 1-891 are the original training set
test <- combi2[892:1309, ]    # rows 892-1309 are the Kaggle test set
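
As an aside, xgboost also accepts an xgb.DMatrix, which bundles the feature matrix and the label into one object; the line below is an optional, equivalent alternative to passing data and label separately as done in the next section.

# Optional: wrap the features and the label (column 1, Survived) in an xgb.DMatrix
dtrain <- xgb.DMatrix(data = train[, -1], label = train[, 1])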

Making the Trees!

(This is the part that you'll want to fiddle with! Try different sets of parameters; a cross-validation sketch for comparing them follows the model fit below.)

param <- list("objective" = "binary:logistic", eta=0.1, 
              subsample=0.5, max_depth=6)



fit_xgboost <- xgboost(param =param, data = train[, -c(1)], label = train[, c(1)], nrounds=15)
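
To compare parameter sets more systematically than by re-running the fit, the package's xgb.cv() helper runs k-fold cross-validation. A minimal sketch, reusing the param list and train matrix from above (5 folds and the error metric are arbitrary choices here, not tuned values):

# 5-fold cross-validation; the test error column of the log indicates how well
# each parameter set generalises beyond the training rows
cv <- xgb.cv(params = param, data = train[, -1], label = train[, 1],
             nrounds = 15, nfold = 5, metrics = "error", verbose = 0)
print(cv$evaluation_log)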

Looking at the important variables

# Get the real feature names (the column names of the feature matrix)
feature_names <- colnames(train[, -1])

# Compute the feature importance matrix
importance_matrix <- xgb.importance(feature_names = feature_names, model = fit_xgboost)

# Plotting
xgb.plot.importance(importance_matrix)

Submitting to Kaggle

# Prediction on test and train sets
pred_xgboost_test <- predict(fit_xgboost, test[, -c(1)])
pred_xgboost_train <- predict(fit_xgboost, train[, -c(1)])

# A look at the distribution of predicted survival probabilities on the test set
hist(pred_xgboost_test)
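
Before writing the submission, a rough sanity check is to compare the rounded training predictions with the true labels. This is an in-sample accuracy, so treat it as an optimistic upper bound rather than an estimate of the leaderboard score.

# In-sample accuracy at the 0.5 threshold (column 1 of train is Survived)
mean(round(pred_xgboost_train) == train[, 1])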


# Creating the submission file (predictions are rounded at 0.5 into 0/1 labels)
submit <- data.frame(PassengerId = data$PassengerId[892:1309], Survived = round(pred_xgboost_test))
write.csv(submit, file = "firstxgboost.csv", row.names = FALSE)