Adena Lin
October 28, 2015
test_1 <- test
test_1$Survived <- 0
model_1 <- data.frame(PassengerId = test_1$PassengerId, Survived = test_1$Survived)
write.csv(model_1, file = "model1.csv", row.names = FALSE)
# % Correct
train_1 <- train
train_1$Predicted <- 0
train_1$Correct <- 0
train_1$Correct[train_1$Survived == train_1$Predicted] <- 1
mean(train_1$Correct)
[1] 0.6161616
M1P <- as.factor(train_1$Predicted)
M1S <- as.factor(train_1$Survived)
sens1 <- sensitivity(data = M1P,
reference = M1S,
positive = "1")
spec1 <- specificity(data = M1P,
reference = M1S,
negative = "0")
# J = Sensitivity + Specificity − 1 measures the proportions of correctly predicted samples for both the event and nonevent groups
sens1 + spec1 - 1 # J = 0
[1] 0
test_2 <- test
test_2$Survived <- 0
test_2$Survived[test_2$Sex == "female"] <- 1
model_2 <- data.frame(PassengerId = test_2$PassengerId, Survived = test_2$Survived)
% Correct
[1] 0.7867565
J = Sensitivity + Specificity − 1
[1] 0.5337456
test_3 <- test
# Set age range to classify children
test_3$Child[test_3$Age < 18] <- 1
test_3$Child[test_3$Age >= 18] <- 0
# Set all women & children to "Survived"
test_3$Survived <- 0
test_3$Survived[test_3$Sex == "female"] <- 1
test_3$Survived[test_3$Child == "1"] <- 1
model_3 <- data.frame(PassengerId = test_3$PassengerId, Survived = test_3$Survived)
Preparing the data:
# Combine train & test data
temptest <- test
temptrain <- train
temptest$Survived <- NA
test_4 <- rbind(temptrain, temptest)
# Use common embarkment point & factorize
test_4$Embarked[c(62, 830)] <- "S"
test_4$Embarked <- factor(test_4$Embarked)
# Use median fare
test_4$Fare[1044] <- median(test_4$Fare, na.rm = TRUE)
More data preparation:
# Use decision tree to predict missing ages
predicted_age <- rpart(Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked,
data = test_4[!is.na(test_4$Age),],
method = "anova")
test_4$Age[is.na(test_4$Age)] <- predict(predicted_age, test_4[is.na(test_4$Age),])
# Split data back into train & test sets
train_rf <- test_4[1:891,]
test_rf <- test_4[892:1309,]
Building the Random Forest:
set.seed(3)
my_forest <- randomForest(as.factor(Survived) ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
data = train_rf,
importance = TRUE,
ntree=1000)
my_prediction <- predict(my_forest, test_rf)
model_4 <- data.frame(PassengerId = test_rf$PassengerId, Survived = my_prediction)
# Discard PassengerId, Name, Ticket, Cabin
dataset_lm <- subset(train_rf,select=c(2,3,5,6,7,8,10,12))
# Build model, continuous -> class
log_reg <- glm(Survived ~., family = binomial(logit), data = dataset_lm)
mod5pred <- predict(log_reg, test, type = "response")
mod5pred <- ifelse(mod5pred > 0.5, 1, 0)
model_5 <- data.frame(PassengerId = test$PassengerId, Survived = mod5pred)
# Set "NA" to "Died"
model_5$Survived[is.na(model_5$Survived)]<-0
Decision tree with feature engineering:
Getting started:
library(rpart)
ctrain <- train
ctest <- test
ctest$Survived <- NA
dataset <- rbind(ctrain, ctest)
Preparing the data:
# Fill in missing embarkment points
dataset$Embarked[c(62, 830)] <- "S"
dataset$Embarked <- factor(dataset$Embarked)
# Create new "Title" column
dataset$Name <- as.character(dataset$Name)
dataset$Title <- sapply(dataset$Name, FUN=function(x) {strsplit(x, split='[,.]')[[1]][2]})
dataset$Title <- sapply(dataset$Title, FUN=function(x) {strsplit(x, split=' ')[[1]][2]})
dataset[which(dataset$Title=="the"),"Title"] <- "Countess"
dataset$Title <- as.factor(dataset$Title)
# Convert fare to tiers
dataset$Fare[is.na(dataset$Fare)] <- median(dataset$Fare, na.rm=TRUE)
dataset$SpFare[dataset$Fare <= 20] <- 1
dataset$SpFare[dataset$Fare > 20 & dataset$Fare <= 40] <- 2
dataset$SpFare[dataset$Fare > 40 & dataset$Fare <= 60] <- 3
dataset$SpFare[dataset$Fare > 60 & dataset$Fare <= 80] <- 4
dataset$SpFare[dataset$Fare > 80 & dataset$Fare <= 100] <- 5
dataset$SpFare[dataset$Fare > 100 & dataset$Fare <= 150] <- 6
dataset$SpFare[dataset$Fare > 150] <- 7
dataset$SpFare <- factor(dataset$SpFare)
# Create new "Family" column
dataset$Family <- dataset$SibSp + dataset$Parch + 1
# Fill in missing ages
age <- rpart(Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + SpFare + Title + Family,
data = dataset[!is.na(dataset$Age),],
method = "anova")
dataset$Age[is.na(dataset$Age)] <- predict(age, dataset[is.na(dataset$Age),])
# Split back into train & test sets
fetrain <- dataset[1:891,]
fetest <- dataset[892:1309,]
Building the decision tree:
# Generate decision tree
dt <- rpart(Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title + SpFare + Family,
data = fetrain,
method = "class")
fetest$Survived <- predict(dt, fetest, type = "class")
model_9 <- data.frame(PassengerId = fetest$PassengerId, Survived = fetest$Survived)