# references 
# xda package : https://www.r-bloggers.com/introducing-xda-r-package-for-exploratory-data-analysis/
# eda         : http://r4ds.had.co.nz/exploratory-data-analysis.html
library(data.table)
library(dplyr)
library(magrittr)
library(tidyr)
library(stringi)
library(stringr)
library(ggplot2)
library(caret)
library(doMC)
train <- fread("/Users/CA/Downloads/titanic/train.csv", na.strings = "")
test <- fread("/Users/CA/Downloads/titanic/test.csv", na.strings = "")
gender <- fread("/Users/CA/Downloads/titanic/gender_submission.csv", na.strings = "")
head(train)
library(xda)
numSummary(train)
charSummary(train)
# missing value 는 Cabin / Embarked 와 Age 에서 발생하고 있고, 
# Age 는 연령 추정으로, Embarked 는 별도 모름으로, Cabin 은 없음으로 처리하면 좋겠다. 
train %<>% replace_na(list(Embarked = "unknown"))
# 당장 factor type 으로 변환은 Survived, Sex, Embarked, Pclass 이고, 
# Ticket 과 Cabin 은 추가로 살펴봐야 한다. 
train %<>% mutate_each_(funs(factor(.)), c("Sex", "Embarked", "Pclass", "Survived"))
charSummary(train)
# ticket 은 문자 + 숫자로 이뤄져 있으며, 숫자부분은 중복데이터가 발견되고 있다. 
train %<>% 
  separate(col = Ticket, into = c("Ticket.prefix", "Ticket.no"), sep = " ", fill = "left", extra = "drop") %<>%
  replace_na(list(Ticket.prefix = "none"))
train <- transform(train, Ticket.prefix = stri_replace_all_fixed(Ticket.prefix, ".", ""))
charSummary(train)
# ticket no 가 동일한 것들을 디벼보자 
dup_ticket_passengers <- train %>% group_by(Ticket.no) %>% filter(n() > 1)
# ticket no 가 동일한 승객들은, 일행임을 알 수 있다. 이는 추후 연령 추정이나, 최종 생존 예측때 사용될 수 있으니, 
# ticket no 자체를 하나의 factor 로 등록하자, 1개이상의 중복 티켓번호는 각각을 티켓번호로 level 부여, 
# 중복되지 않은 놈들은 그냥 다 퉁쳐서 하나로 level 부여 
group <- subset(dup_ticket_passengers, select = c(PassengerId, Ticket.no))
colnames(group) <- c("PassengerId", "group_no")
group$Ticket.no <- NULL
train <- merge(train, group, by = "PassengerId", all.x = T)
train %<>% replace_na(list(group_no = "none"))
train$group_no <- factor(train$group_no)
train$Ticket.no <- NULL
# cabin 을 디벼보자
library(stringi)
train %<>% replace_na(list(Cabin = "X"))
train <- transform(train, load_count = stri_count(Cabin, regex="\\S+"), Cabin.prefix = stri_sub(Cabin, 0, 1))
train %<>% replace_na(list(load_count = 0)) %>% mutate(Cabin.prefix = ifelse(Cabin.prefix %in% c("G", "T"), "ETC", Cabin.prefix))
table(train$Cabin.prefix)

  1   2   3   4   5   6   9 ETC 
 15  47  59  33  32  13 687   5 
train$Cabin <-NULL
# family_size 를 생성하자
train <- transform(train, family_size = SibSp + Parch)
table(train$family_size)

  0   1   2   3   4   5   6   7  10 
537 161 102  29  15  22  12   6   7 
# fare 를 디벼보자
# numSummary 를 보면 outlier 갯수가 많지만.... 일단 버리지 말고, categorizing 하자. 
plot(density(train$Fare))

plot(density(log(train$Fare)))

train <- transform(train, log_fare = ifelse(round(log(Fare),0) <= 1, 1, round(log(Fare),0)))
train$log_fare <- factor(train$log_fare)
# 1. missing value 처리 : Cabin / Embarked -> unknown
# 2. Cabin  => Cabin.prefix , Load.count
# 3. Ticket => Ticket.prefix , Group.no
# 4. Fare => Log Transformation => log_fare ( 1 ~ 6 )
# 5. Factor type 으로 변경 : Sex , Embarked, PClass, Survived, Cabin.prefix, Ticket.Prefix, Group.no, log_fare
# 6. Family Size 추가 = SibSp + Parch
#
# 7. Name 디비기
#
# 8. Age 추정
# name 을 디벼보자, 이름, 호칭, 성 으로 구성되는 듯 하다. 
name_seperator <- Vectorize(function(name) {
  splitted <- strsplit(name, ",")[[1]][2]
  return(str_trim(str_split(splitted, "\\.")[[1]][1]))
})
train %<>% mutate(Name.call = name_seperator(Name))
table(train$Name.call)

        Capt          Col          Don           Dr     Jonkheer         Lady        Major       Master         Miss         Mlle          Mme 
           1            2            1            7            1            1            2           40          182            2            1 
          Mr          Mrs           Ms          Rev          Sir the Countess 
         517          125            1            6            1            1 
# 호칭 레벨이 너무 많아지는 것을 막기위해, 10개 이하는 etc 로 구분한다. 
name_seperator <- Vectorize(function(name) {
  splitted <- strsplit(name, ",")[[1]][2]
  call <- str_trim(str_split(splitted, "\\.")[[1]][1])
  
  
  return(ifelse(call %in% c('Master', 'Miss', 'Mr', 'Mrs'), call, 'Etc'))
})
train %<>% mutate(Name.call = name_seperator(Name))
table(train$Name.call)

   Etc Master   Miss     Mr    Mrs 
    27     40    182    517    125 
train$Name <- NULL
par(mfrow = c(2,2))
ggplot(train,aes(x=log_fare,group=Survived,fill=Survived))+geom_bar(position="identity",alpha=0.5)+theme_bw()

ggplot(train,aes(x=Cabin.prefix,group=Survived,fill=Survived))+geom_bar(position="identity",alpha=0.5)+theme_bw()

ggplot(train,aes(x=Ticket.prefix,group=Survived,fill=Survived))+geom_bar(position="identity",alpha=0.5)+theme_bw()

ggplot(train,aes(x=Name.call,group=Survived,fill=Survived))+geom_bar(position="identity",alpha=0.5)+theme_bw()

numSummary(train)
charSummary(train)
# Age 추정 
# Age 가 있는 놈들로 모델링해서, missing 인 놈들을 채운다. 
train <- transform(train, Age = ifelse(is.na(Age), -1, ifelse(round(Age / 10,0) >= 6, 6, round(Age / 10,0))))
has_age <- subset(train, Age > 0)
none_age <- setdiff(train, has_age)
has_age$PassengerId <- NULL
has_age.train <- sample_frac(has_age, 0.5)
has_age.test <- setdiff(has_age, has_age.train)
age.fit <- glm(Age ~ . , data = has_age.train)
age.fit.steps <- step(age.fit, direction = "backward", trace = 0)
summary(age.fit.steps)

Call:
glm(formula = Age ~ Pclass + Sex + SibSp + Name.call, data = has_age.train)

Deviance Residuals: 
   Min      1Q  Median      3Q     Max  
-2.316  -0.886  -0.091   0.599   3.114  

Coefficients:
                Estimate Std. Error t value Pr(>|t|)    
(Intercept)       3.0096     0.6082    4.95  1.2e-06 ***
Pclass2          -1.0289     0.1574   -6.54  2.4e-10 ***
Pclass3          -1.2337     0.1415   -8.72  < 2e-16 ***
Sexmale           1.9891     0.7264    2.74   0.0065 ** 
SibSp            -0.2261     0.0931   -2.43   0.0157 *  
Name.callMaster  -2.1945     0.6941   -3.16   0.0017 ** 
Name.callMiss     0.4203     0.6258    0.67   0.5023    
Name.callMr      -0.8788     0.4109   -2.14   0.0332 *  
Name.callMrs      1.5325     0.6233    2.46   0.0145 *  
---
Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

(Dispersion parameter for gaussian family taken to be 1.1)

    Null deviance: 517.89  on 334  degrees of freedom
Residual deviance: 359.07  on 326  degrees of freedom
AIC: 993.9

Number of Fisher Scoring iterations: 2
age.fit <- glm(
  formula = Age ~ Survived + Pclass + Sex + SibSp + Name.call, 
  data = has_age.train, 
  family = "gaussian") 
has_age.test$pred <- predict(age.fit, has_age.test, type = "response")
plot(has_age.test$Age, has_age.test$pred)

has_age <- subset(train, Age != -1)
none_age <- setdiff(train, has_age)

has_age$PassengerId <- NULL
has_age$Ticket.prefix <- NULL
has_age$group_no <- NULL
has_age$Cabin.prefix <- NULL

has_age.train <- sample_frac(has_age, 0.3)
has_age.test <- setdiff(has_age, has_age.train)

has_age.train$Age <- factor(has_age.train$Age)


#registerDoMC(cores = 4)

fitControl <- trainControl(## 10-fold CV
                           method = "repeatedcv",
                           number = 10,
                           ## repeated ten times
                           repeats = 10)

gbmGrid <-  expand.grid(interaction.depth = c(1,3,5,9), 
                        n.trees = (1:10)*10, 
                        shrinkage = 0.05,
                        n.minobsinnode = 10)

set.seed(825)
age.fit.gbm <- train(Age ~ ., data = subset(has_age.train, select = -c(Survived)), 
                 method = "gbm", 
                 metric = "Accuracy",
                 trControl = fitControl,
                 tuneGrid = gbmGrid,
                 verbose = FALSE)
# The final values used for the model were n.trees = 20, interaction.depth = 3, shrinkage = 0.05 and n.minobsinnode = 10.
plot(age.fit.gbm)
plot(age.fit.gbm, metric = "Accuracy")

has_age.test$pred <- predict(age.fit.gbm, has_age.test)
table(has_age.test$Age, has_age.test$pred)
has_age <- subset(train, Age > 0)
none_age <- setdiff(train, has_age)
has_age$PassengerId <- NULL
has_age$Ticket.prefix <- NULL
has_age$group_no <- NULL
has_age$Cabin.prefix <- NULL
has_age.train <- sample_frac(has_age, 0.5)
has_age.test <- setdiff(has_age, has_age.train)
has_age.train$Age <- factor(has_age.train$Age)
library(C50)
fit.c50 <- C5.0( x = subset(has_age.train, select = -c(Survived, Age)),
                 y = has_age.train$Age, 
                 # an integer specifying the number of boosting iterations. A value of one indicates that a single model is used.
                 # C5.0 uses adaptive boosting 
                 trials = 10,  
                 control = C5.0Control(earlyStopping = F))
print(fit.c50)

Call:
C5.0.default(x = subset(has_age.train, select = -c(Survived, Age)), y = has_age.train$Age, trials = 10, control = C5.0Control(earlyStopping = F))

Classification Tree
Number of samples: 335 
Number of predictors: 10 

Number of boosting iterations: 10 
Average tree size: 26.9 

Non-standard options: attempt to group attributes, early stopping for boosting
has_age.test$pred <- predict(fit.c50, has_age.test)
plot(has_age.test$Age, has_age.test$pred)

table(has_age.test$Age, has_age.test$pred)
   
     1  2  3  4  5  6
  1  5  6  1  0  0  0
  2  6 36 14  8  9  1
  3  0 22 17  7  5  5
  4  0 14 13  8 13  6
  5  0  3  7  4  9  5
  6  0  3  1  6  3  2
# Random Search
control <- trainControl(method="repeatedcv", number=10, repeats=3, search="random")
set.seed(1234)
mtry <- sqrt(ncol(has_age.train))
rf_random <- train(Age ~., data=has_age.train, method="rf", metric="Kappa", tuneLength=15, trControl=control)
Loading required package: randomForest
randomForest 4.6-12
Type rfNews() to see new features/changes/bug fixes.

Attaching package: 'randomForest'

The following object is masked from 'package:ggplot2':

    margin

The following object is masked from 'package:dplyr':

    combine
print(rf_random)
Random Forest 

335 samples
 11 predictor
  6 classes: '1', '2', '3', '4', '5', '6' 

No pre-processing
Resampling: Cross-Validated (10 fold, repeated 3 times) 
Summary of sample sizes: 302, 302, 300, 301, 301, 302, ... 
Resampling results across tuning parameters:

  mtry  Accuracy  Kappa
   1    0.376     0.114
   3    0.431     0.229
   5    0.411     0.214
   6    0.413     0.215
   7    0.404     0.206
   8    0.396     0.197
   9    0.407     0.212
  15    0.414     0.222
  16    0.412     0.220
  18    0.408     0.217
  20    0.402     0.209
  21    0.399     0.207

Kappa was used to select the optimal model using  the largest value.
The final value used for the model was mtry = 3.
plot(rf_random)

non_age <- subset(train, Age == -1)
non_age$Age <- predict(age.fit.gbm, non_age)
train$Age <- factor(train$Age)
predicted_ages <- subset(non_age, select = c(PassengerId, Age))
train %<>% anti_join(predicted_ages, by = "PassengerId") %>% bind_rows(predicted_ages)
Unequal factor levels: coercing to character
train <- na.omit(train)
ggplot(train,aes(x=Age,group=Survived,fill=Survived))+geom_bar(position="identity",alpha=0.5)+theme_bw()

registerDoMC(cores = 4)

fitControl <- trainControl(## 10-fold CV
                           method = "repeatedcv",
                           number = 10,
                           ## repeated ten times
                           repeats = 10)

gbmGrid <-  expand.grid(interaction.depth = c(1,3,5,7,9), 
                        n.trees = (1:10)*50, 
                        shrinkage = 0.01,
                        n.minobsinnode = 5)

set.seed(825)
Survived.fit.gbm <- train(Survived ~ ., data = train, 
                 method = "gbm", 
                 metric = "Accuracy",
                 trControl = fitControl,
                 tuneGrid = gbmGrid,
                 verbose = FALSE)
# The final values used for the model were n.trees = 450, interaction.depth = 3, shrinkage = 0.01 and n.minobsinnode = 5.
plot(Survived.fit.gbm)

library(data.table)
library(dplyr)
library(magrittr)
library(tidyr)
library(stringi)
library(stringr)
library(xda)
library(caret)
library(doMC)
# test 데이터에 대해, 위에 데이터 가공을 하는 함수를 만들자.
name_seperator <- Vectorize(function(name) {
  splitted <- strsplit(name, ",")[[1]][2]
  call <- str_trim(str_split(splitted, "\\.")[[1]][1])
  
  return(ifelse(call %in% c('Master', 'Miss', 'Mr', 'Mrs'), call, 'Etc'))
})
tt <- function(test) {
  test %<>% 
    replace_na(list(Embarked = "unknown")) %>%
    mutate_each_(funs(factor(.)), c("Sex", "Embarked", "Pclass")) %>%
    separate(col = Ticket, into = c("Ticket.prefix", "Ticket.no"), sep = " ", fill = "left", extra = "drop") %>%
    replace_na(list(Ticket.prefix = "none", Fare = 0)) %>%
    mutate(Ticket.prefix = stri_replace_all_fixed(Ticket.prefix, ".", "")) %>% 
    mutate(Ticket.prefix = ifelse(Ticket.prefix %in% c("CA", "PC"), Ticket.prefix ,"ETC"))
    
  dup_ticket_passengers <- test %>% group_by(Ticket.no) %>% filter(n() > 1)
  group <- subset(dup_ticket_passengers, select = c(PassengerId, Ticket.no))
  colnames(group) <- c("PassengerId", "group_no")
  group$Ticket.no <- NULL
  
  test %<>% 
    merge(group, by = "PassengerId", all.x = T) %>%
    replace_na(list(group_no = "none")) %>% 
    mutate_each_(funs(factor(.)), c("group_no")) -> test
  
  test %<>% 
    replace_na(list(Cabin = "X")) %>% 
    mutate(load_count = stri_count(Cabin, regex="\\S+"), Cabin.prefix = stri_sub(Cabin, 0, 1)) %>%
    replace_na(list(load_count = 0)) %>% 
    mutate(Cabin.prefix = ifelse(Cabin.prefix %in% c("G", "T"), "ETC", Cabin.prefix))
  
  test <- transform(test, family_size = SibSp + Parch)
  
  test <- transform(test, log_fare = ifelse(round(log(Fare),0) <= 1, 1, round(log(Fare),0)))
  test$log_fare <- factor(test$log_fare)
  
  test %<>% 
    mutate(Name.call = name_seperator(Name)) %>%
    mutate_each_(funs(factor(.)), c("Name.call"))
  
  test <- transform(test, Age = ifelse(is.na(Age), -1, ifelse(round(Age / 10,0) >= 6, 6, round(Age / 10,0))))
  test %<>% select(-c(Name, Cabin, Ticket.no, Fare))
  
  return(test)
}
train_age_preiction_model <- function(train) {
  train <- subset(train, Age != -1)
  train$Age <- factor(train$Age)
  
  train %<>% select(-c(PassengerId, Ticket.prefix, group_no, Cabin.prefix, Survived))
  fitControl <- trainControl(method = "repeatedcv", number = 10, repeats = 10)
  
  gbmGrid <-  expand.grid(interaction.depth = c(1,3,5,9), 
                          n.trees = (1:10)*10, 
                          shrinkage = 0.05,
                          n.minobsinnode = 5)
  
  set.seed(1234)
  age.fit.gbm <- train(Age ~ ., data = train, 
                   method = "gbm", 
                   metric = "Accuracy",
                   trControl = fitControl,
                   tuneGrid = gbmGrid,
                   verbose = T)
  
  return(age.fit.gbm)
}
train_survival_prediction_model <- function(train) {
  train <- subset(train, Age != -1)
  train$Age <- factor(train$Age)
  train$Survived <- factor(train$Survived)
  
  train %<>% select(-c(group_no))
  
  fitControl <- trainControl(method = "repeatedcv",
                             number = 10,
                             repeats = 10)
  
  gbmGrid <-  expand.grid(interaction.depth = c(1,3,5), 
                          n.trees = (1:10)*50, 
                          shrinkage = 0.05,
                          n.minobsinnode = 5)
  
  set.seed(825)
  Survived.fit.gbm <- train(Survived ~ ., data = train, 
                   method = "gbm", 
                   metric = "Accuracy",
                   trControl = fitControl,
                   tuneGrid = gbmGrid,
                   verbose = T)
  
  return(Survived.fit.gbm)
}
predict_age <- function(test, age.fit) {
      
  non_age <- subset(test, Age == -1)
  non_age$Age <- predict(age.fit, non_age)
  
  test$Age <- factor(test$Age)
  test %<>% anti_join(non_age, by = "PassengerId") %>% bind_rows(non_age)
  test$Age <- factor(test$Age)
  return(test)
}
train <- fread("/Users/CA/Downloads/titanic/train.csv", na.strings = "")
test <- fread("/Users/CA/Downloads/titanic/test.csv", na.strings = "")

refined_train_data <- tt(train)
refined_test_data <- tt(test)

age.fit <- train_age_preiction_model(refined_train_data)
survival.fit <- train_survival_prediction_model(refined_train_data)

refined_test_data          <- predict_age(refined_test_data, age.fit)
refined_test_data$pred     <- predict(survival.fit, refined_test_data)
refined_test_data <- merge(refined_test_data, gender, by = "PassengerId")
column names 'Survived.x', 'Prediction.x', 'Survived.y', 'Prediction.y' are duplicated in the result
confusionMatrix(gender$Survived, refined_test_data$pred)
Confusion Matrix and Statistics

          Reference
Prediction   0   1
         0 248  18
         1   5 147
                                        
               Accuracy : 0.945         
                 95% CI : (0.919, 0.965)
    No Information Rate : 0.605         
    P-Value [Acc > NIR] : <2e-16        
                                        
                  Kappa : 0.883         
 Mcnemar's Test P-Value : 0.0123        
                                        
            Sensitivity : 0.980         
            Specificity : 0.891         
         Pos Pred Value : 0.932         
         Neg Pred Value : 0.967         
             Prevalence : 0.605         
         Detection Rate : 0.593         
   Detection Prevalence : 0.636         
      Balanced Accuracy : 0.936         
                                        
       'Positive' Class : 0             
                                        
