library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.2.1 ✔ purrr 0.3.3
## ✔ tibble 2.1.3 ✔ dplyr 0.8.3
## ✔ tidyr 1.0.0 ✔ stringr 1.4.0
## ✔ readr 1.3.1 ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(tidymodels)
## Registered S3 method overwritten by 'xts':
## method from
## as.zoo.xts zoo
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.0.3 ──
## ✔ broom 0.5.2 ✔ recipes 0.1.7
## ✔ dials 0.0.4 ✔ rsample 0.0.5
## ✔ infer 0.5.1 ✔ yardstick 0.0.4
## ✔ parsnip 0.0.4
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dials::margin() masks ggplot2::margin()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## ✖ recipes::yj_trans() masks scales::yj_trans()
library(keras)
##
## Attaching package: 'keras'
## The following object is masked from 'package:yardstick':
##
## get_weights
Load data
# Download the Titanic training set (passenger attributes plus the Survived
# label). readr guesses the column types and prints the spec it used.
train <- read_csv(
  file = "https://raw.githubusercontent.com/agconti/kaggle-titanic/master/data/train.csv"
)
## Parsed with column specification:
## cols(
## PassengerId = col_double(),
## Survived = col_double(),
## Pclass = col_double(),
## Name = col_character(),
## Sex = col_character(),
## Age = col_double(),
## SibSp = col_double(),
## Parch = col_double(),
## Ticket = col_character(),
## Fare = col_double(),
## Cabin = col_character(),
## Embarked = col_character()
## )
# Download the Titanic test set. It carries the same passenger attributes as
# the training set but has no Survived column.
test <- read_csv(
  file = "https://raw.githubusercontent.com/agconti/kaggle-titanic/master/data/test.csv"
)
## Parsed with column specification:
## cols(
## PassengerId = col_double(),
## Pclass = col_double(),
## Name = col_character(),
## Sex = col_character(),
## Age = col_double(),
## SibSp = col_double(),
## Parch = col_double(),
## Ticket = col_character(),
## Fare = col_double(),
## Cabin = col_character(),
## Embarked = col_character()
## )
Fix variable types
# Remove the character columns that are not used as features, then recode the
# categorical variables (including the Survived label) as factors.
train <- select(train, -c(Name, Cabin, Ticket))
train <- mutate(
  train,
  Survived = as.factor(Survived),
  Pclass = as.factor(Pclass),
  Sex = as.factor(Sex),
  Embarked = as.factor(Embarked)
)
# Same cleanup as for the training data, minus Survived (absent from the test
# set): drop the unused character columns and recode categoricals as factors.
test <- select(test, -c(Name, Cabin, Ticket))
test <- mutate(
  test,
  Pclass = as.factor(Pclass),
  Sex = as.factor(Sex),
  Embarked = as.factor(Embarked)
)
Impute missing data, dummy-encode and normalize predictors
# Preprocessing recipe. prep() estimates every step on `train`; juice() then
# returns the processed training data and bake() applies the same trained
# steps to `test`.
titanic_prep <- recipe(Survived ~ ., data = train) %>%
  # PassengerId is a row identifier, not a feature. Giving it an "id" role
  # keeps it out of all_predictors(), so it is neither normalized nor used as
  # a neighbour variable by the KNN imputation. The column itself is still
  # returned by juice()/bake().
  update_role(PassengerId, new_role = "id") %>%
  # Numeric gaps: mean for Fare, median for Age / SibSp / Parch.
  step_meanimpute(Fare) %>%
  step_medianimpute(Age, SibSp, Parch) %>%
  # Categorical gaps: most frequent level for Sex / Embarked, nearest
  # neighbours for Pclass.
  step_modeimpute(Sex, Embarked) %>%
  step_knnimpute(Pclass) %>%
  # One-hot style encoding for every factor except the outcome.
  step_dummy(all_nominal(), -Survived) %>%
  # Centre and scale all predictors.
  step_normalize(all_predictors()) %>%
  prep()

# Processed training data (includes Survived and PassengerId).
juiced <- juice(titanic_prep)
# Processed test data, transformed with the statistics learned from `train`.
baked <- bake(titanic_prep, new_data = test)
Prepare data for Keras
# Keras is given plain numeric matrices rather than tibbles.
# Training features: every processed column except the row id and the label.
x_train <- juiced %>%
  select(-PassengerId, -Survived) %>%
  as.matrix()

# The test data has no Survived column, so dropping it by name is not safe.
# Select exactly the training feature columns instead; this also guarantees
# that x_test has the same columns, in the same order, as x_train.
x_test <- as.matrix(baked[, colnames(x_train)])

# Survived is a factor built from the numeric label. Convert it back to
# integers explicitly (via character, so the original values are recovered
# rather than the internal level codes) instead of relying on how a factor
# happens to be converted when handed to Keras, then one-hot encode it.
y_train <- as.integer(as.character(juiced$Survived))
y_train <- to_categorical(y_train)
Define a model
# Fully connected classifier: two ReLU hidden layers with dropout, followed
# by a 2-unit softmax that matches the two columns of the one-hot label.
model <- keras_model_sequential()
model %>%
  # Input width is taken from the feature matrix instead of being hard-coded,
  # so the network stays in sync if the preprocessing adds or removes columns.
  layer_dense(
    units = 256,
    activation = "relu",
    input_shape = ncol(x_train)
  ) %>%
  layer_dropout(rate = 0.4) %>%
  layer_dense(units = 128, activation = "relu") %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 2, activation = "softmax")

# Cross-entropy on the one-hot labels, RMSprop optimizer, accuracy reported
# during training.
model %>% compile(
  loss = "categorical_crossentropy",
  optimizer = optimizer_rmsprop(),
  metrics = c("accuracy")
)
Train a model
# Fit for 30 epochs in mini-batches of 128 rows; validation_split = 0.2 sets
# aside a fifth of the training data for the validation metrics.
history <- fit(
  model,
  x = x_train,
  y = y_train,
  batch_size = 128,
  epochs = 30,
  validation_split = 0.2
)

# Plot the per-epoch training and validation metrics.
plot(history)

Predict
# predict_classes() is deprecated and no longer available in newer
# Keras/TensorFlow releases, so compute the class from the softmax output
# directly. predict() returns one row per test passenger and one column per
# class.
class_probs <- predict(model, x_test)
# which.max() gives the 1-based column of the largest probability (first one
# on ties); subtract 1 to get the 0/1 class label.
predictions <- apply(class_probs, 1, which.max) - 1L
predictions
## [1] 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0
## [38] 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 0
## [75] 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
## [112] 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## [149] 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
## [186] 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
## [223] 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
## [260] 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0
## [297] 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
## [334] 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 1 1 0
## [371] 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 1 0 0 0
## [408] 0 1 1 1 1 0 0 1 0 0 0
# Re-read the raw test set: `test` was overwritten above with the cleaned
# version (Name, Cabin and Ticket dropped), and the export should contain the
# untouched rows.
original_test <- read_csv(
  file = "https://raw.githubusercontent.com/agconti/kaggle-titanic/master/data/test.csv"
)
## Parsed with column specification:
## cols(
## PassengerId = col_double(),
## Pclass = col_double(),
## Name = col_character(),
## Sex = col_character(),
## Age = col_double(),
## SibSp = col_double(),
## Parch = col_double(),
## Ticket = col_character(),
## Fare = col_double(),
## Cabin = col_character(),
## Embarked = col_character()
## )
# Append the predicted class as a `predictions` column to the raw test rows
# and save the result as a CSV file (path is relative to the working
# directory).
test_predictions <- cbind(original_test, predictions)
test_predictions %>% write_csv("test_predictions.csv")