library(xgboost)
library(rpart)
# if you're using RStudio Cloud
train <- read.csv("train.csv")
test <- read.csv("test.csv")
# if you're using RStudio on AWS
train <- read.csv("/home/rstudio/data/train.csv")
test <- read.csv("/home/rstudio/data/test.csv")
# if you're working locally with a project in your Google Drive Data Science folder
train <- read.csv("../Data_Science_Data/titanic/train.csv")
test <- read.csv("../Data_Science_Data/titanic/test.csv")
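# Note: from R 4.0 on, read.csv() no longer converts strings to factors by default.
# The factor-to-numeric conversions later in this script assume Sex, Embarked, etc.
# arrive as factors, so on R >= 4.0 read the files with stringsAsFactors = TRUE, e.g.:
# train <- read.csv("train.csv", stringsAsFactors = TRUE)
# test <- read.csv("test.csv", stringsAsFactors = TRUE)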
feature_eng <- function(train_df, test_df) {
# Combining the train and test sets so the features are engineered consistently on both
test_df$Survived <- NA
combi <- rbind(train_df, test_df)
# Feature engineering
combi$Name <- as.character(combi$Name)
# Extracting each passenger's title from the name; rare titles are then collapsed to reduce noise in the data
combi$Title <- sapply(combi$Name, FUN=function(x) {strsplit(x, split='[,.]')[[1]][2]})
combi$Title <- sub(' ', '', combi$Title)
#table(combi$Title)
combi$Title[combi$Title %in% c('Mme', 'Mlle')] <- 'Mlle'
combi$Title[combi$Title %in% c('Capt', 'Don', 'Major', 'Sir')] <- 'Sir'
combi$Title[combi$Title %in% c('Dona', 'Lady', 'the Countess', 'Jonkheer')] <- 'Lady'
combi$Title <- factor(combi$Title)
# Reuniting families: an identifier built from family size and surname
combi$FamilySize <- combi$SibSp + combi$Parch + 1
combi$Surname <- sapply(combi$Name, FUN=function(x) {strsplit(x, split='[,.]')[[1]][1]})
combi$FamilyID <- paste(as.character(combi$FamilySize), combi$Surname, sep="")
combi$FamilyID[combi$FamilySize <= 2] <- 'Small'
#table(combi$FamilyID)
combi$FamilyID <- factor(combi$FamilyID)
# Fitting a decision tree (rpart) to impute the missing Age values
Agefit <- rpart(Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize, data=combi[!is.na(combi$Age),], method="anova")
combi$Age[is.na(combi$Age)] <- predict(Agefit, combi[is.na(combi$Age),])
# Fill in the Embarked and Fare missing values
#which(combi$Embarked == '')
combi$Embarked[c(62, 830)] <- "S"  # the two blank rows found above; "S" is the most common port
combi$Embarked <- factor(combi$Embarked)
#which(is.na(combi$Fare))
combi$Fare[1044] <- median(combi$Fare, na.rm=TRUE)
# Creating a new FamilyID2 variable that reduces the number of factor levels in FamilyID,
# so that a random forest model can also be used on this data
combi$FamilyID2 <- combi$FamilyID
combi$FamilyID2 <- as.character(combi$FamilyID2)
combi$FamilyID2[combi$FamilySize <= 3] <- 'Small'
combi$FamilyID2 <- factor(combi$FamilyID2)
return(combi)
}
# Calling the engineering function
data <- feature_eng(train, test)
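# A quick optional sanity check that the engineering worked: titles should be
# collapsed into a small set and no missing Age values should remain.
table(data$Title)
sum(is.na(data$Age))  # should be 0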
# Creating a dataframe containing only the columns that interest us (dropping PassengerId,
# Name, Ticket, Cabin, Surname and FamilyID2) before converting the data to numerics for xgboost
combi2 <- data[, -c(1, 4, 9, 11, 15, 17)]
# Converting factors to numerics; subtracting 1 makes the integer codes start at 0
# (xgboost just needs numeric input; FamilySize is already numeric, so its shift is harmless for trees)
combi2$Pclass <- as.numeric(combi2$Pclass) - 1
combi2$Sex <- as.numeric(combi2$Sex) - 1
combi2$Embarked <- as.numeric(combi2$Embarked) - 1
combi2$Title <- as.numeric(combi2$Title) - 1
combi2$FamilySize <- as.numeric(combi2$FamilySize) - 1
combi2$FamilyID <- as.numeric(combi2$FamilyID) - 1
# convert the new dataframe into a matrix
combi2 <- as.matrix(combi2)
# Re-splitting into the original train (rows 1:891) and test (rows 892:1309) sets
train <- combi2[1:891, ]
test <- combi2[892:1309, ]
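# Optional check: every training column must have survived the numeric conversion.
# Any "factor" that was actually a character vector would have become NA above.
sum(is.na(train))  # should be 0 (test[, 1] is NA by construction; train should be clean)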
# This is the part that you'll want to fiddle with! Try different sets of parameters.
param <- list(objective = "binary:logistic", eta = 0.1,
              subsample = 0.5, max_depth = 6)
fit_xgboost <- xgboost(params = param, data = train[, -c(1)], label = train[, c(1)], nrounds = 15)
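# One way to fiddle systematically is k-fold cross-validation with xgb.cv.
# This is just a sketch; the nfold and nrounds values here are illustrative, not tuned.
dtrain <- xgb.DMatrix(data = train[, -c(1)], label = train[, c(1)])
cv <- xgb.cv(params = param, data = dtrain, nrounds = 50, nfold = 5,
             metrics = "error", verbose = 0)
# In recent xgboost versions cv$evaluation_log holds the per-round train/test error;
# pick the parameter set (and nrounds) with the lowest test error.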
# Getting the real feature names
names <- dimnames(train[, -c(1)])[[2]]
# Compute feature importance matrix
importance_matrix <- xgb.importance(names, model = fit_xgboost)
# Plotting
xgb.plot.importance(importance_matrix)
# Prediction on test and train sets
pred_xgboost_test <- predict(fit_xgboost, test[, -c(1)])
pred_xgboost_train <- predict(fit_xgboost, train[, -c(1)])
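# pred_xgboost_train isn't needed for the submission, but it gives a rough sense of fit:
# training accuracy at the default 0.5 cutoff (an optimistic diagnostic, not a validation score)
mean(round(pred_xgboost_train) == train[, c(1)])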
# A look at the distribution of predicted survival probabilities
hist(pred_xgboost_test)
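# round() in the submission below amounts to a 0.5 probability cutoff. To make the
# threshold explicit (and tunable if the histogram looks skewed), you could instead use:
# Survived = as.numeric(pred_xgboost_test > 0.5)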
# Creating the submission file
submit <- data.frame(PassengerId = data[892:1309,c("PassengerId")], Survived = round(pred_xgboost_test))
write.csv(submit, file = "firstxgboost.csv", row.names = FALSE)