library(tidyverse)
## ── Attaching packages ───────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse 1.3.0 ──
## ✔ ggplot2 3.2.1     ✔ purrr   0.3.3
## ✔ tibble  2.1.3     ✔ dplyr   0.8.3
## ✔ tidyr   1.0.0     ✔ stringr 1.4.0
## ✔ readr   1.3.1     ✔ forcats 0.4.0
## ── Conflicts ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(tidymodels)
## Registered S3 method overwritten by 'xts':
##   method     from
##   as.zoo.xts zoo
## ── Attaching packages ──────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels 0.0.3 ──
## ✔ broom     0.5.2     ✔ recipes   0.1.7
## ✔ dials     0.0.4     ✔ rsample   0.0.5
## ✔ infer     0.5.1     ✔ yardstick 0.0.4
## ✔ parsnip   0.0.4
## ── Conflicts ─────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard()   masks purrr::discard()
## ✖ dplyr::filter()     masks stats::filter()
## ✖ recipes::fixed()    masks stringr::fixed()
## ✖ dplyr::lag()        masks stats::lag()
## ✖ dials::margin()     masks ggplot2::margin()
## ✖ yardstick::spec()   masks readr::spec()
## ✖ recipes::step()     masks stats::step()
## ✖ recipes::yj_trans() masks scales::yj_trans()
library(keras)
## 
## Attaching package: 'keras'
## The following object is masked from 'package:yardstick':
## 
##     get_weights

# Load data

# Labelled training split (contains the `Survived` column), downloaded from the
# agconti/kaggle-titanic repository on GitHub.
train <- readr::read_csv(
  file = "https://raw.githubusercontent.com/agconti/kaggle-titanic/master/data/train.csv"
)
## Parsed with column specification:
## cols(
##   PassengerId = col_double(),
##   Survived = col_double(),
##   Pclass = col_double(),
##   Name = col_character(),
##   Sex = col_character(),
##   Age = col_double(),
##   SibSp = col_double(),
##   Parch = col_double(),
##   Ticket = col_character(),
##   Fare = col_double(),
##   Cabin = col_character(),
##   Embarked = col_character()
## )
# Unlabelled test split (same columns as the training file except `Survived`),
# downloaded from the same GitHub repository.
test <- readr::read_csv(
  file = "https://raw.githubusercontent.com/agconti/kaggle-titanic/master/data/test.csv"
)
## Parsed with column specification:
## cols(
##   PassengerId = col_double(),
##   Pclass = col_double(),
##   Name = col_character(),
##   Sex = col_character(),
##   Age = col_double(),
##   SibSp = col_double(),
##   Parch = col_double(),
##   Ticket = col_character(),
##   Fare = col_double(),
##   Cabin = col_character(),
##   Embarked = col_character()
## )

# Fix variable types

# Drop the character columns that are not used for modelling (Name, Cabin,
# Ticket) and turn the categorical columns, including the outcome `Survived`,
# into factors.
train <- mutate(
  select(train, -c(Name, Cabin, Ticket)),
  Survived = as.factor(Survived),
  Pclass = as.factor(Pclass),
  Sex = as.factor(Sex),
  Embarked = as.factor(Embarked)
)
         

# Same cleaning as for the training data; there is no `Survived` column in the
# test file, so only the three categorical predictors are converted.
test <- mutate(
  select(test, -c(Name, Cabin, Ticket)),
  Pclass = as.factor(Pclass),
  Sex = as.factor(Sex),
  Embarked = as.factor(Embarked)
)

# Impute missing data, encode and normalize predictors

# Preprocessing recipe, estimated on the training data only.
#
# `Survived ~ .` would treat PassengerId as a predictor, so the row identifier
# would take part in the kNN imputation and be centred/scaled. It is given the
# "id" role instead: it stays in the processed data (and is dropped before
# modelling) but is ignored by every `all_predictors()` selection.
#
# Steps, in order:
#   1. mean-impute Fare
#   2. median-impute Age, SibSp, Parch
#   3. mode-impute Sex, Embarked
#   4. kNN-impute Pclass
#   5. dummy-encode all nominal columns except the outcome
#   6. normalize all predictors
titanic_prep <- recipe(Survived ~ ., data = train) %>%
  update_role(PassengerId, new_role = "id") %>%
  step_meanimpute(Fare) %>%
  step_medianimpute(Age, SibSp, Parch) %>%
  step_modeimpute(Sex, Embarked) %>%
  step_knnimpute(Pclass) %>%
  step_dummy(all_nominal(), -all_outcomes()) %>%
  step_normalize(all_predictors()) %>%
  prep()

# Processed training data, taken from the prepped recipe.
juiced <- juice(titanic_prep)

# Test data pushed through the same steps, using the statistics estimated on
# the training data.
baked <- bake(titanic_prep, new_data = test)

# Keras processing

# Keras is fed plain numeric matrices rather than tibbles. PassengerId is an
# identifier and Survived is the label, so neither is a feature.
x_train <- juiced %>%
  select(-PassengerId, -Survived) %>%
  as.matrix()

# Select the test features by the training column names instead of negating
# PassengerId/Survived: the test file has no Survived column, so dropping it by
# name may fail, and selecting by name guarantees that the test matrix has the
# same columns in the same order as the matrix the network is trained on.
x_test <- as.matrix(baked[, colnames(x_train)])

# One-hot encode the label. Survived is a factor at this point; convert it
# through its labels to integers explicitly so the encoding does not depend on
# how a factor happens to be passed to Python.
# Assumes the labels are 0/1 (two classes, matching the 2-unit output layer).
y_train <- to_categorical(
  as.integer(as.character(juiced$Survived)),
  num_classes = 2
)

# Define a model

# Feed-forward classifier: two hidden ReLU layers (256 and 128 units), each
# followed by dropout, and a 2-unit softmax output (one unit per class).
model <- keras_model_sequential()

# The pipe result is not reassigned; the code relies on the layers being added
# to `model` in place.
model %>%
  layer_dense(
    units = 256,
    activation = "relu",
    # Derive the input width from the training matrix instead of hard-coding
    # it, so the network keeps matching the data when preprocessing changes.
    input_shape = ncol(x_train)
  ) %>%
  layer_dropout(rate = 0.4) %>%
  layer_dense(units = 128, activation = "relu") %>%
  layer_dropout(rate = 0.3) %>%
  layer_dense(units = 2, activation = "softmax")

# Categorical cross-entropy pairs with the one-hot encoded labels in y_train.
model %>% compile(
  loss = "categorical_crossentropy",
  optimizer = optimizer_rmsprop(),
  metrics = c("accuracy")
)

# Train a model

# Train for 30 epochs in mini-batches of 128 rows; a fraction of 0.2 of the
# training data is set aside for validation. The returned object records the
# metrics of each epoch.
history <- fit(
  model,
  x = x_train,
  y = y_train,
  batch_size = 128,
  epochs = 30,
  validation_split = 0.2
)

# Plot the recorded training history.
plot(history)

# Predict

# Predicted class (0 or 1) for every test row.
# `predict_classes()` is deprecated and no longer available in recent
# Keras/TensorFlow releases, so take the arg-max of the softmax output
# instead; this works on old and new versions alike.
class_probs <- predict(model, x_test)

# max.col() is 1-based while the class labels are 0-based, hence the `- 1L`.
# ties.method = "first" mirrors arg-max, which returns the first maximum.
predictions <- max.col(class_probs, ties.method = "first") - 1L

predictions
##   [1] 0 0 0 0 0 0 1 0 1 0 0 0 1 0 1 1 0 0 0 0 0 0 1 1 1 0 1 0 0 0 0 0 0 0 0 0 0
##  [38] 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 1 1 0 0 0 0 0 1 0 0 0 1 0 1 1 0 0 1 1 0 0 0
##  [75] 1 0 0 1 0 1 1 0 0 0 0 0 1 0 1 1 0 0 1 0 0 0 1 0 0 0 1 0 0 0 1 0 0 0 0 0 0
## [112] 1 1 1 1 0 0 1 0 1 1 0 1 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
## [149] 0 0 1 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 1 0 0 1 0 1
## [186] 0 1 0 0 0 0 0 0 0 1 0 1 0 0 0 1 1 0 1 0 0 1 0 1 0 0 0 0 1 0 0 1 0 1 0 1 0
## [223] 1 0 1 1 0 1 0 0 0 1 0 0 0 0 0 0 1 1 1 1 0 0 0 0 1 0 1 1 1 0 0 0 0 0 0 0 1
## [260] 0 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0
## [297] 1 0 0 0 0 0 0 0 1 1 0 1 0 0 0 0 0 1 1 1 0 0 0 0 0 0 0 0 1 0 1 0 0 0 1 0 0
## [334] 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 1 0 0 0 1 0 1 0 0 0 0 1 1 0 1 0 0 1 1 0
## [371] 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 1 0 1 0 0 1 0 1 1 0 0 0
## [408] 0 1 1 1 1 0 0 1 0 0 0
# Read the test file again: `test` was overwritten earlier with a version that
# no longer has the Name, Cabin and Ticket columns.
original_test <- readr::read_csv(
  file = "https://raw.githubusercontent.com/agconti/kaggle-titanic/master/data/test.csv"
)
## Parsed with column specification:
## cols(
##   PassengerId = col_double(),
##   Pclass = col_double(),
##   Name = col_character(),
##   Sex = col_character(),
##   Age = col_double(),
##   SibSp = col_double(),
##   Parch = col_double(),
##   Ticket = col_character(),
##   Fare = col_double(),
##   Cabin = col_character(),
##   Embarked = col_character()
## )
# Append the predicted classes as a `predictions` column to the original test
# rows (rows are matched by position) and save the result as CSV.
test_predictions <- data.frame(original_test, predictions, check.names = FALSE)

write_csv(test_predictions, "test_predictions.csv")