# Titanic

# Per-column summary of the raw passenger data frame `t` (printed below).
summary(t)

##   PassengerId       Survived          Pclass          Title    
##  Min.   :  1.0   Min.   :0.0000   Min.   :1.000    Mr    :517  
##  1st Qu.:223.5   1st Qu.:0.0000   1st Qu.:2.000    Miss  :182  
##  Median :446.0   Median :0.0000   Median :3.000    Mrs   :125  
##  Mean   :446.0   Mean   :0.3838   Mean   :2.309    Master: 40  
##  3rd Qu.:668.5   3rd Qu.:1.0000   3rd Qu.:3.000    Dr    :  7  
##  Max.   :891.0   Max.   :1.0000   Max.   :3.000    Rev   :  6  
##                                                   (Other): 14  
##      Sex           Age            SibSp           Parch       
##  female:314   Min.   : 0.42   Min.   :0.000   Min.   :0.0000  
##  male  :577   1st Qu.:20.12   1st Qu.:0.000   1st Qu.:0.0000  
##               Median :28.00   Median :0.000   Median :0.0000  
##               Mean   :29.70   Mean   :0.523   Mean   :0.3816  
##               3rd Qu.:38.00   3rd Qu.:1.000   3rd Qu.:0.0000  
##               Max.   :80.00   Max.   :8.000   Max.   :6.0000  
##               NA's   :177                                     
##   Dependancies          Ticket         Fare                Cabin    
##  Min.   : 0.0000   1601    :  7   Min.   :  0.00              :687  
##  1st Qu.: 0.0000   347082  :  7   1st Qu.:  7.91   B96 B98    :  4  
##  Median : 0.0000   CA. 2343:  7   Median : 14.45   C23 C25 C27:  4  
##  Mean   : 0.9046   3101295 :  6   Mean   : 32.20   G6         :  4  
##  3rd Qu.: 1.0000   347088  :  6   3rd Qu.: 31.00   C22 C26    :  3  
##  Max.   :10.0000   CA 2144 :  6   Max.   :512.33   D          :  3  
##                    (Other) :852                    (Other)    :186  
##  Embarked
##   :  2   
##  C:168   
##  Q: 77   
##  S:644   
##          
##          
##

# Give every passenger a weight of 1; the alluvial plots further down use Q as
# their y aesthetic.
t <- mutate(t, Q = 1)

# Inspect the column types to see which ones need converting.
str(t)

## 'data.frame':    891 obs. of  14 variables:
##  $ PassengerId : int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived    : int  0 1 1 1 0 0 0 0 1 1 ...
##  $ Pclass      : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Title       : Factor w/ 17 levels " Capt"," Col",..: 12 13 9 13 12 12 12 8 13 13 ...
##  $ Sex         : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
##  $ Age         : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp       : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch       : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Dependancies: int  1 1 0 1 0 0 0 4 2 1 ...
##  $ Ticket      : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
##  $ Fare        : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin       : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
##  $ Embarked    : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
##  $ Q           : num  1 1 1 1 1 1 1 1 1 1 ...

# Survived and Pclass are stored as integers but are categorical, so turn them
# into factors.
t$Survived <- as.factor(t$Survived)
t$Pclass <- as.factor(t$Pclass)

# 2 sets, 1 with age and 1 without age.
# Both halves are defined by the same is.na(Age) test so they are exact
# complements. na.omit(t) would also discard rows with a missing value in any
# other column, and such rows would then land in neither set.
t.age <- t[!is.na(t$Age), ]
t.wo.age <- t[is.na(t$Age), ]

# Remove the Age column from the dataset without age (it is NA in every row
# of that set).
colnames(t.wo.age)

##  [1] "PassengerId"  "Survived"     "Pclass"       "Title"       
##  [5] "Sex"          "Age"          "SibSp"        "Parch"       
##  [9] "Dependancies" "Ticket"       "Fare"         "Cabin"       
## [13] "Embarked"     "Q"

# Drop by name rather than by position 6, so the right column goes even if
# the column order changes upstream.
t.wo.age <- t.wo.age[, colnames(t.wo.age) != "Age", drop = FALSE]

# Plots of data with age.
# Each chart shows one variable, filled by survival outcome. Columns are mapped
# by bare name inside aes(): writing t.age$col there bypasses the `data`
# argument and labels the axes and legend "t.age$col".

# NOTE(review): the breaks end at 48 while Age goes up to 80 in the summary
# above, so older passengers fall outside the bins - confirm this is intended.
ggplot(t.age, aes(x = Age, fill = Survived)) +
  geom_histogram(breaks = seq(0, 50, by = 3))

ggplot(t.age, aes(x = Pclass, fill = Survived)) +
  geom_bar()

ggplot(t.age, aes(x = Sex, fill = Survived)) +
  geom_bar()

ggplot(t.age, aes(x = Dependancies, fill = Survived)) +
  geom_bar()

# NOTE(review): the breaks end at 300 while Fare goes up to 512.33 - confirm
# that leaving out the top fares is intended.
ggplot(t.age, aes(x = Fare, fill = Survived)) +
  geom_histogram(breaks = seq(0, 300, by = 25))

ggplot(t.age, aes(x = Embarked, fill = Survived)) +
  geom_bar()

# Alluvial diagram: flows from Sex through Pclass to Survived, each passenger
# contributing Q = 1 to the flow height.
# NOTE(review): label.strata may be deprecated in newer ggalluvial releases in
# favour of aes(label = after_stat(stratum)) - check the installed version.
ggplot(t.age,
       aes(y = Q,
           axis1 = Sex, axis2 = Pclass, axis3 = Survived)) +
  geom_alluvium(aes(fill = Survived)) +
  geom_stratum(width = 1/3) +
  geom_text(stat = "stratum", label.strata = TRUE) +
  scale_x_continuous(breaks = 1:3, labels = c("Gender", "Class", "Survived")) +
  ggtitle("Titanic survival by class and sex")

# Check multi-collinearity, distributions etc.
# Columns are picked by name so the plot does not depend on column order.
ggpairs(t.age[, c("Survived", "Pclass", "Sex", "Age",
                  "SibSp", "Parch", "Dependancies", "Fare")])

# Remove SibSp & Parch.
# NOTE(review): in the rows printed by str() above, Dependancies equals
# SibSp + Parch, which would make the two columns redundant - confirm.
colnames(t.age)

##  [1] "PassengerId"  "Survived"     "Pclass"       "Title"       
##  [5] "Sex"          "Age"          "SibSp"        "Parch"       
##  [9] "Dependancies" "Ticket"       "Fare"         "Cabin"       
## [13] "Embarked"     "Q"

t.age.nc <- t.age[, !(colnames(t.age) %in% c("SibSp", "Parch")), drop = FALSE]
str(t.age.nc)

## 'data.frame':    714 obs. of  12 variables:
##  $ PassengerId : int  1 2 3 4 5 7 8 9 10 11 ...
##  $ Survived    : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 2 2 2 ...
##  $ Pclass      : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 1 3 3 2 3 ...
##  $ Title       : Factor w/ 17 levels " Capt"," Col",..: 12 13 9 13 12 12 8 13 13 9 ...
##  $ Sex         : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
##  $ Age         : num  22 38 26 35 35 54 2 27 14 4 ...
##  $ Dependancies: int  1 1 0 1 0 0 4 2 1 2 ...
##  $ Ticket      : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 86 396 345 133 617 ...
##  $ Fare        : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin       : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 131 1 1 1 147 ...
##  $ Embarked    : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 4 4 4 2 4 ...
##  $ Q           : num  1 1 1 1 1 1 1 1 1 1 ...

#Feature Engineering
#Ticket
# Frequency of each ticket value among passengers with a known age. Ticket is
# a factor with 681 levels (see the str() output above) and the counts printed
# below are mostly 2 or 3 per ticket.
summary(t.age.nc$Ticket)

##        347082       3101295        347088       CA 2144        382652 
##             7             6             6             6             5 
##  S.O.C. 14879        113760        113781          1601         19950 
##             5             4             4             4             4 
##          2666        347077        349909          LINE    W./C. 6608 
##             4             4             4             4             4 
##        110152        110413         13502         17421        230080 
##             3             3             3             3             3 
##         24160         29106        345773        347742         35273 
##             3             3             3             3             3 
##        363291    C.A. 31921    C.A. 34651  F.C.C. 13529      PC 17572 
##             3             3             3             3             3 
##      PC 17582      PC 17755      PC 17757      PC 17760 SC/Paris 2123 
##             3             3             3             3             3 
##        111361        113572        113776        113789        113803 
##             2             2             2             2             2 
##        113806         11668         11751         11767         11967 
##             2             2             2             2             2 
##         12749         13507         16966         17474         19877 
##             2             2             2             2             2 
##         19928         19943        220845        230136        230433 
##             2             2             2             2             2 
##        231919        237736        239865        243847        244252 
##             2             2             2             2             2 
##        244367        248727        248738        250644        250647 
##             2             2             2             2             2 
##        250649        250655         26360          2651          2653 
##             2             2             2             2             2 
##          2659          2691          2699         28403          2908 
##             2             2             2             2             2 
##         29750       3101278         31027        345764        347054 
##             2             2             2             2             2 
##        347080        349237         35281        364516        364849 
##             2             2             2             2             2 
##         36928         36947         36973        370129        392096 
##             2             2             2             2             2 
##         54636          7534     A/4 48871     A/5. 3336     C.A. 2315 
##             2             2             2             2             2 
##     C.A. 2673    C.A. 33112    C.A. 37671     P/PP 3381       (Other) 
##             2             2             2             2           455

# Collapse Ticket into a two-level factor: "Numeric" when nothing is left after
# stripping the digits, "Alpha Numeric" when letters, dots, slashes or spaces
# remain.
# The tickets are read from t.age.nc itself (the frame being modified) rather
# than from t.age, so the new column cannot be misaligned if the two frames
# ever differ in rows. local() keeps the scratch value out of the global
# environment.
t.age.nc$ticket <- local({
  non.digit.part <- gsub("\\d+", "", as.character(t.age.nc$Ticket))
  factor(ifelse(non.digit.part == "", "Numeric", "Alpha Numeric"))
})
colnames(t.age.nc)

##  [1] "PassengerId"  "Survived"     "Pclass"       "Title"       
##  [5] "Sex"          "Age"          "Dependancies" "Ticket"      
##  [9] "Fare"         "Cabin"        "Embarked"     "Q"           
## [13] "ticket"

# Drop the raw Ticket column by name. The match is case-sensitive, so the
# derived `ticket` column stays.
t.age.nc <- t.age.nc[, colnames(t.age.nc) != "Ticket", drop = FALSE]
colnames(t.age.nc)

##  [1] "PassengerId"  "Survived"     "Pclass"       "Title"       
##  [5] "Sex"          "Age"          "Dependancies" "Fare"        
##  [9] "Cabin"        "Embarked"     "Q"            "ticket"

str(t.age.nc)

## 'data.frame':    714 obs. of  12 variables:
##  $ PassengerId : int  1 2 3 4 5 7 8 9 10 11 ...
##  $ Survived    : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 2 2 2 ...
##  $ Pclass      : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 1 3 3 2 3 ...
##  $ Title       : Factor w/ 17 levels " Capt"," Col",..: 12 13 9 13 12 12 8 13 13 9 ...
##  $ Sex         : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
##  $ Age         : num  22 38 26 35 35 54 2 27 14 4 ...
##  $ Dependancies: int  1 1 0 1 0 0 4 2 1 2 ...
##  $ Fare        : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin       : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 131 1 1 1 147 ...
##  $ Embarked    : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 4 4 4 2 4 ...
##  $ Q           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ ticket      : Factor w/ 2 levels "Alpha Numeric",..: 1 1 1 2 2 2 2 2 2 1 ...

#Cabin
# Indicator for whether a cabin is recorded: Cabin is the empty string when it
# is not, giving cabin = "0"; any other value gives cabin = "1".
t.age.nc$cabin <- factor(if_else(t.age.nc$Cabin == "", 0, 1))
colnames(t.age.nc)

##  [1] "PassengerId"  "Survived"     "Pclass"       "Title"       
##  [5] "Sex"          "Age"          "Dependancies" "Fare"        
##  [9] "Cabin"        "Embarked"     "Q"            "ticket"      
## [13] "cabin"

# Drop the raw Cabin column by name. `Cabin` and `cabin` differ only in case,
# so a positional index is easy to get wrong here.
t.age.nc <- t.age.nc[, colnames(t.age.nc) != "Cabin", drop = FALSE]
str(t.age.nc)

## 'data.frame':    714 obs. of  12 variables:
##  $ PassengerId : int  1 2 3 4 5 7 8 9 10 11 ...
##  $ Survived    : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 2 2 2 ...
##  $ Pclass      : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 1 3 3 2 3 ...
##  $ Title       : Factor w/ 17 levels " Capt"," Col",..: 12 13 9 13 12 12 8 13 13 9 ...
##  $ Sex         : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
##  $ Age         : num  22 38 26 35 35 54 2 27 14 4 ...
##  $ Dependancies: int  1 1 0 1 0 0 4 2 1 2 ...
##  $ Fare        : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Embarked    : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 4 4 4 2 4 ...
##  $ Q           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ ticket      : Factor w/ 2 levels "Alpha Numeric",..: 1 1 1 2 2 2 2 2 2 1 ...
##  $ cabin       : Factor w/ 2 levels "0","1": 1 2 1 2 1 2 1 1 1 2 ...

#Title
# Count passengers per raw title. The factor levels carry a leading space
# (" Capt", " Col", ... in the str() output above), which is why the recoding
# that follows compares against " Mr", " Mrs" and so on.
summary(t.age.nc$Title)

##          Capt           Col           Don            Dr      Jonkheer 
##             1             2             1             6             1 
##          Lady         Major        Master          Miss          Mlle 
##             1             2            36           146             2 
##           Mme            Mr           Mrs            Ms           Rev 
##             1           398           108             1             6 
##           Sir  the Countess 
##             1             1

# Confirm Title is stored as a factor before recoding it.
class(t.age.nc$Title)

## [1] "factor"

# Recode Title into five groups: the four common titles keep their own label
# (without the leading space of the raw levels) and every other title becomes
# "Others". A missing Title stays missing.
t.age.nc$title <- local({
  lookup <- c(" Mr" = "Mr", " Mrs" = "Mrs", " Miss" = "Miss",
              " Master" = "Master")
  raw <- as.character(t.age.nc$Title)
  grouped <- unname(lookup[raw])
  # Titles not in the lookup come back as NA; only real titles become "Others".
  grouped[is.na(grouped) & !is.na(raw)] <- "Others"
  as.factor(grouped)
})
summary(t.age.nc$title)

## Master   Miss     Mr    Mrs Others 
##     36    146    398    108     26

colnames(t.age.nc)

##  [1] "PassengerId"  "Survived"     "Pclass"       "Title"       
##  [5] "Sex"          "Age"          "Dependancies" "Fare"        
##  [9] "Embarked"     "Q"            "ticket"       "cabin"       
## [13] "title"

# Drop the raw Title column by name. The match is case-sensitive, so the
# grouped `title` column stays.
t.age.nc <- t.age.nc[, colnames(t.age.nc) != "Title", drop = FALSE]
str(t.age.nc)

## 'data.frame':    714 obs. of  12 variables:
##  $ PassengerId : int  1 2 3 4 5 7 8 9 10 11 ...
##  $ Survived    : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 2 2 2 ...
##  $ Pclass      : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 1 3 3 2 3 ...
##  $ Sex         : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
##  $ Age         : num  22 38 26 35 35 54 2 27 14 4 ...
##  $ Dependancies: int  1 1 0 1 0 0 4 2 1 2 ...
##  $ Fare        : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Embarked    : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 4 4 4 2 4 ...
##  $ Q           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ ticket      : Factor w/ 2 levels "Alpha Numeric",..: 1 1 1 2 2 2 2 2 2 1 ...
##  $ cabin       : Factor w/ 2 levels "0","1": 1 2 1 2 1 2 1 1 1 2 ...
##  $ title       : Factor w/ 5 levels "Master","Miss",..: 3 4 2 4 3 3 1 4 4 2 ...

#Data for modelling
# Leave out PassengerId (values run 1..891, one per row) and Q (constant 1,
# only used as the plot weight). They are selected by name rather than by
# position so the right columns go even if the column order changes.
d <- t.age.nc[, !(colnames(t.age.nc) %in% c("PassengerId", "Q")), drop = FALSE]
str(d)

## 'data.frame':    714 obs. of  10 variables:
##  $ Survived    : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 2 2 2 ...
##  $ Pclass      : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 1 3 3 2 3 ...
##  $ Sex         : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
##  $ Age         : num  22 38 26 35 35 54 2 27 14 4 ...
##  $ Dependancies: int  1 1 0 1 0 0 4 2 1 2 ...
##  $ Fare        : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Embarked    : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 4 4 4 2 4 ...
##  $ ticket      : Factor w/ 2 levels "Alpha Numeric",..: 1 1 1 2 2 2 2 2 2 1 ...
##  $ cabin       : Factor w/ 2 levels "0","1": 1 2 1 2 1 2 1 1 1 2 ...
##  $ title       : Factor w/ 5 levels "Master","Miss",..: 3 4 2 4 3 3 1 4 4 2 ...

#Splitting the data
data <- d

# Recode the outcome to syntactically valid labels ("0"/"1" -> "X0"/"X1")
# before splitting, so the training and validation sets share the same labels.
# Recoding TrainSet alone leaves ValidSet on "0"/"1" while predictions come
# back as "X0"/"X1".
# NOTE(review): caret is understood to require valid R names as class levels
# when classProbs = TRUE - confirm against the caret documentation.
data$Survived <- factor(make.names(data$Survived))

# 80/20 split; the fractional size is truncated by sample().
set.seed(18)
train <- sample(nrow(data), .8*nrow(data), replace = FALSE)
TrainSet <- data[train,]
ValidSet <- data[-train,]

#Tuning parameters
# NOTE(review): gbmGrid is built here but never passed to train() below (there
# is no tuneGrid argument); the printed results show shrinkage held at 0.1 and
# a 3 x 3 grid, i.e. caret's defaults. Either pass tuneGrid = gbmGrid or delete
# the grid. As written it has 5 * 51 * 100 rows and includes n.trees = 0.
gbmGrid <-  expand.grid(interaction.depth = c(1, 3, 6, 9, 10),
                        n.trees = (0:50)*50, 
                        shrinkage = seq(.0005, .05,.0005),
                        n.minobsinnode = 10)

# Repeated cross-validation, 5 repeats, with class probabilities computed.
fitControl <- trainControl(method = "repeatedcv",
                           repeats = 5,
                           classProbs = TRUE)

set.seed(142)

# Fit gbm through caret, selecting the tuning values by Kappa; system.time()
# reports how long the fit takes.
system.time(gbm.ada.1 <- caret::train(Survived ~ ., 
                                      data = TrainSet ,
                                      method = "gbm", 
                                      trControl = fitControl,
                                      verbose = FALSE,
                                      metric = "Kappa"))

##    user  system elapsed 
##   13.14    0.13   14.86

# Print the fitted caret model: resampling setup, Accuracy/Kappa for each
# tuning combination, and the final values chosen.
gbm.ada.1

## Stochastic Gradient Boosting 
## 
## 571 samples
##   9 predictor
##   2 classes: 'X0', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 514, 514, 514, 514, 515, 513, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa    
##   1                   50      0.8113221  0.6020723
##   1                  100      0.8113216  0.6034435
##   1                  150      0.8095731  0.5990527
##   2                   50      0.8148550  0.6066429
##   2                  100      0.8134818  0.6036922
##   2                  150      0.8149350  0.6075714
##   3                   50      0.8103427  0.5964966
##   3                  100      0.8106998  0.5982891
##   3                  150      0.8034036  0.5824405
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Kappa was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150,
##  interaction.depth = 2, shrinkage = 0.1 and n.minobsinnode = 10.

# Score the hold-out set and cross-tabulate predicted (rows) against actual
# (columns) survival. The table gets its own name: assigning it to `t` would
# overwrite the passenger data frame loaded at the top of the script.
pred <- predict(gbm.ada.1, ValidSet)
conf.age <- table(pred, ValidSet$Survived)
conf.age

##     
## pred  0  1
##   X0 72 15
##   X1  9 47

# Plots of data WITHOUT age.
# Each chart shows one variable, filled by survival outcome. Columns are mapped
# by bare name inside aes(): writing t.wo.age$col there bypasses the `data`
# argument and labels the axes and legend "t.wo.age$col".

ggplot(t.wo.age, aes(x = Pclass, fill = Survived)) +
  geom_bar()

ggplot(t.wo.age, aes(x = Sex, fill = Survived)) +
  geom_bar()

ggplot(t.wo.age, aes(x = Dependancies, fill = Survived)) +
  geom_bar()

# NOTE(review): the breaks end at 300; any fare above that falls outside the
# bins - confirm this is intended.
ggplot(t.wo.age, aes(x = Fare, fill = Survived)) +
  geom_histogram(breaks = seq(0, 300, by = 25))

ggplot(t.wo.age, aes(x = Embarked, fill = Survived)) +
  geom_bar()

# Alluvial diagram: flows from Sex through Pclass to Survived, each passenger
# contributing Q = 1 to the flow height.
# NOTE(review): label.strata may be deprecated in newer ggalluvial releases in
# favour of aes(label = after_stat(stratum)) - check the installed version.
ggplot(t.wo.age,
       aes(y = Q,
           axis1 = Sex, axis2 = Pclass, axis3 = Survived)) +
  geom_alluvium(aes(fill = Survived)) +
  geom_stratum(width = 1/3) +
  geom_text(stat = "stratum", label.strata = TRUE) +
  scale_x_continuous(breaks = 1:3, labels = c("Gender", "Class", "Survived")) +
  ggtitle("Titanic survival by class and sex")

#Check multi-collinearity, distributions etc.
# Inspect the layout of the no-age set first: with Age removed it has 13
# columns, so column positions differ from the with-age set.
str(t.wo.age)

## 'data.frame':    177 obs. of  13 variables:
##  $ PassengerId : int  6 18 20 27 29 30 32 33 37 43 ...
##  $ Survived    : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 2 2 2 1 ...
##  $ Pclass      : Factor w/ 3 levels "1","2","3": 3 2 3 3 3 3 1 3 3 3 ...
##  $ Title       : Factor w/ 17 levels " Capt"," Col",..: 12 12 13 12 9 12 13 9 12 12 ...
##  $ Sex         : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 1 1 2 2 ...
##  $ SibSp       : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Parch       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Dependancies: int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Ticket      : Factor w/ 681 levels "110152","110413",..: 276 152 185 180 284 363 587 289 203 392 ...
##  $ Fare        : num  8.46 13 7.22 7.22 7.88 ...
##  $ Cabin       : Factor w/ 148 levels "","A10","A14",..: 1 1 1 1 1 1 43 1 1 1 ...
##  $ Embarked    : Factor w/ 4 levels "","C","Q","S": 3 4 2 2 3 4 2 3 2 2 ...
##  $ Q           : num  1 1 1 1 1 1 1 1 1 1 ...

# Pairs plot of the candidate predictors, selected by name so the plot does
# not depend on column positions.
# NOTE(review): this selection includes Embarked, whereas the with-age pairs
# plot earlier in the script does not - confirm which set is intended.
ggpairs(t.wo.age[, c("Survived", "Pclass", "Sex", "SibSp",
                     "Parch", "Dependancies", "Fare", "Embarked")])

# Remove SibSp & Parch.
colnames(t.wo.age)

##  [1] "PassengerId"  "Survived"     "Pclass"       "Title"       
##  [5] "Sex"          "SibSp"        "Parch"        "Dependancies"
##  [9] "Ticket"       "Fare"         "Cabin"        "Embarked"    
## [13] "Q"

t.wo.age.nc <- t.wo.age[, !(colnames(t.wo.age) %in% c("SibSp", "Parch")),
                        drop = FALSE]
str(t.wo.age.nc)

## 'data.frame':    177 obs. of  11 variables:
##  $ PassengerId : int  6 18 20 27 29 30 32 33 37 43 ...
##  $ Survived    : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 2 2 2 1 ...
##  $ Pclass      : Factor w/ 3 levels "1","2","3": 3 2 3 3 3 3 1 3 3 3 ...
##  $ Title       : Factor w/ 17 levels " Capt"," Col",..: 12 12 13 12 9 12 13 9 12 12 ...
##  $ Sex         : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 1 1 2 2 ...
##  $ Dependancies: int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Ticket      : Factor w/ 681 levels "110152","110413",..: 276 152 185 180 284 363 587 289 203 392 ...
##  $ Fare        : num  8.46 13 7.22 7.22 7.88 ...
##  $ Cabin       : Factor w/ 148 levels "","A10","A14",..: 1 1 1 1 1 1 43 1 1 1 ...
##  $ Embarked    : Factor w/ 4 levels "","C","Q","S": 3 4 2 2 3 4 2 3 2 2 ...
##  $ Q           : num  1 1 1 1 1 1 1 1 1 1 ...

#Feature Engineering
#Ticket
# Frequency of each ticket value among passengers without a recorded age; most
# tickets in the output below occur exactly once.
summary(t.wo.age.nc$Ticket)

##   CA. 2343       4133       1601     239853     371110       2661 
##          7          4          3          3          3          2 
##       2668     367226     367230     370365     376564 W./C. 6607 
##          2          2          2          2          2          2 
##     110465     111427     112052     112058     112379     113028 
##          1          1          1          1          1          1 
##     113056     113505     113510     113767     113796     113798 
##          1          1          1          1          1          1 
##      11774      12460      14311      14312      14313      16988 
##          1          1          1          1          1          1 
##      17421      17453      17464      19947      19988      19996 
##          1          1          1          1          1          1 
##     226593     239854     239855     239856     244373     248727 
##          1          1          1          1          1          1 
##       2624       2626       2627       2629       2631       2641 
##          1          1          1          1          1          1 
##       2647       2649       2662       2664       2665       2671 
##          1          1          1          1          1          1 
##       2674       2677       2678       2686       2689       2700 
##          1          1          1          1          1          1 
##     312991     312993     315037     323592     330877     330909 
##          1          1          1          1          1          1 
##     330919     330931     330932     330935     330959     330979 
##          1          1          1          1          1          1 
##     330980     334912     335677       3411     343095     345777 
##          1          1          1          1          1          1 
##     349201     349208     349214     349215     349216     349217 
##          1          1          1          1          1          1 
##     349218     349221     349222     349223     349225     349227 
##          1          1          1          1          1          1 
##     349234     349253     349254      35852     358585      36209 
##          1          1          1          1          1          1 
##     362316     364498     364848    (Other) 
##          1          1          1         56

# Collapse Ticket into a two-level factor: "Numeric" when nothing is left after
# stripping the digits, "Alpha Numeric" when letters, dots, slashes or spaces
# remain.
# The tickets are read from t.wo.age.nc itself (the frame being modified)
# rather than from t.wo.age, so the new column cannot be misaligned if the two
# frames ever differ in rows. local() keeps the scratch value out of the global
# environment.
t.wo.age.nc$ticket <- local({
  non.digit.part <- gsub("\\d+", "", as.character(t.wo.age.nc$Ticket))
  factor(ifelse(non.digit.part == "", "Numeric", "Alpha Numeric"))
})
colnames(t.wo.age.nc)

##  [1] "PassengerId"  "Survived"     "Pclass"       "Title"       
##  [5] "Sex"          "Dependancies" "Ticket"       "Fare"        
##  [9] "Cabin"        "Embarked"     "Q"            "ticket"

# Drop the raw Ticket column by name. The match is case-sensitive, so the
# derived `ticket` column stays.
t.wo.age.nc <- t.wo.age.nc[, colnames(t.wo.age.nc) != "Ticket", drop = FALSE]
colnames(t.wo.age.nc)

##  [1] "PassengerId"  "Survived"     "Pclass"       "Title"       
##  [5] "Sex"          "Dependancies" "Fare"         "Cabin"       
##  [9] "Embarked"     "Q"            "ticket"

str(t.wo.age.nc)

## 'data.frame':    177 obs. of  11 variables:
##  $ PassengerId : int  6 18 20 27 29 30 32 33 37 43 ...
##  $ Survived    : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 2 2 2 1 ...
##  $ Pclass      : Factor w/ 3 levels "1","2","3": 3 2 3 3 3 3 1 3 3 3 ...
##  $ Title       : Factor w/ 17 levels " Capt"," Col",..: 12 12 13 12 9 12 13 9 12 12 ...
##  $ Sex         : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 1 1 2 2 ...
##  $ Dependancies: int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Fare        : num  8.46 13 7.22 7.22 7.88 ...
##  $ Cabin       : Factor w/ 148 levels "","A10","A14",..: 1 1 1 1 1 1 43 1 1 1 ...
##  $ Embarked    : Factor w/ 4 levels "","C","Q","S": 3 4 2 2 3 4 2 3 2 2 ...
##  $ Q           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ ticket      : Factor w/ 2 levels "Alpha Numeric",..: 2 2 2 2 2 2 1 2 2 2 ...

#Cabin
# Indicator for whether a cabin is recorded: Cabin is the empty string when it
# is not, giving cabin = "0"; any other value gives cabin = "1".
t.wo.age.nc$cabin <- factor(if_else(t.wo.age.nc$Cabin == "", 0, 1))
colnames(t.wo.age.nc)

##  [1] "PassengerId"  "Survived"     "Pclass"       "Title"       
##  [5] "Sex"          "Dependancies" "Fare"         "Cabin"       
##  [9] "Embarked"     "Q"            "ticket"       "cabin"

# Drop the raw Cabin column by name. `Cabin` and `cabin` differ only in case,
# so a positional index is easy to get wrong here.
t.wo.age.nc <- t.wo.age.nc[, colnames(t.wo.age.nc) != "Cabin", drop = FALSE]
str(t.wo.age.nc)

## 'data.frame':    177 obs. of  11 variables:
##  $ PassengerId : int  6 18 20 27 29 30 32 33 37 43 ...
##  $ Survived    : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 2 2 2 1 ...
##  $ Pclass      : Factor w/ 3 levels "1","2","3": 3 2 3 3 3 3 1 3 3 3 ...
##  $ Title       : Factor w/ 17 levels " Capt"," Col",..: 12 12 13 12 9 12 13 9 12 12 ...
##  $ Sex         : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 1 1 2 2 ...
##  $ Dependancies: int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Fare        : num  8.46 13 7.22 7.22 7.88 ...
##  $ Embarked    : Factor w/ 4 levels "","C","Q","S": 3 4 2 2 3 4 2 3 2 2 ...
##  $ Q           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ ticket      : Factor w/ 2 levels "Alpha Numeric",..: 2 2 2 2 2 2 1 2 2 2 ...
##  $ cabin       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...

#Title
# Count passengers per raw title in the no-age set. All 17 factor levels are
# still listed, but most have a count of 0 here.
summary(t.wo.age.nc$Title)

##          Capt           Col           Don            Dr      Jonkheer 
##             0             0             0             1             0 
##          Lady         Major        Master          Miss          Mlle 
##             0             0             4            36             0 
##           Mme            Mr           Mrs            Ms           Rev 
##             0           119            17             0             0 
##           Sir  the Countess 
##             0             0

# Confirm Title is stored as a factor before recoding it.
class(t.wo.age.nc$Title)

## [1] "factor"

# Recode Title into five groups: the four common titles keep their own label
# (without the leading space of the raw levels) and every other title becomes
# "Others". A missing Title stays missing.
t.wo.age.nc$title <- local({
  lookup <- c(" Mr" = "Mr", " Mrs" = "Mrs", " Miss" = "Miss",
              " Master" = "Master")
  raw <- as.character(t.wo.age.nc$Title)
  grouped <- unname(lookup[raw])
  # Titles not in the lookup come back as NA; only real titles become "Others".
  grouped[is.na(grouped) & !is.na(raw)] <- "Others"
  as.factor(grouped)
})
summary(t.wo.age.nc$title)

## Master   Miss     Mr    Mrs Others 
##      4     36    119     17      1

colnames(t.wo.age.nc)

##  [1] "PassengerId"  "Survived"     "Pclass"       "Title"       
##  [5] "Sex"          "Dependancies" "Fare"         "Embarked"    
##  [9] "Q"            "ticket"       "cabin"        "title"

# Drop the raw Title column by name. The match is case-sensitive, so the
# grouped `title` column stays.
t.wo.age.nc <- t.wo.age.nc[, colnames(t.wo.age.nc) != "Title", drop = FALSE]
str(t.wo.age.nc)

## 'data.frame':    177 obs. of  11 variables:
##  $ PassengerId : int  6 18 20 27 29 30 32 33 37 43 ...
##  $ Survived    : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 2 2 2 1 ...
##  $ Pclass      : Factor w/ 3 levels "1","2","3": 3 2 3 3 3 3 1 3 3 3 ...
##  $ Sex         : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 1 1 2 2 ...
##  $ Dependancies: int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Fare        : num  8.46 13 7.22 7.22 7.88 ...
##  $ Embarked    : Factor w/ 4 levels "","C","Q","S": 3 4 2 2 3 4 2 3 2 2 ...
##  $ Q           : num  1 1 1 1 1 1 1 1 1 1 ...
##  $ ticket      : Factor w/ 2 levels "Alpha Numeric",..: 2 2 2 2 2 2 1 2 2 2 ...
##  $ cabin       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
##  $ title       : Factor w/ 5 levels "Master","Miss",..: 3 3 4 3 2 3 4 2 3 3 ...

#Data for modelling
# Leave out PassengerId (one value per row) and Q (constant 1, only used as the
# plot weight). They are selected by name rather than by position: Q sits in
# column 8 here but column 9 in the with-age frame, so a copied index would
# drop the wrong column.
d <- t.wo.age.nc[, !(colnames(t.wo.age.nc) %in% c("PassengerId", "Q")),
                 drop = FALSE]
str(d)

## 'data.frame':    177 obs. of  9 variables:
##  $ Survived    : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 2 2 2 1 ...
##  $ Pclass      : Factor w/ 3 levels "1","2","3": 3 2 3 3 3 3 1 3 3 3 ...
##  $ Sex         : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 1 1 2 2 ...
##  $ Dependancies: int  0 0 0 0 0 0 1 0 0 0 ...
##  $ Fare        : num  8.46 13 7.22 7.22 7.88 ...
##  $ Embarked    : Factor w/ 4 levels "","C","Q","S": 3 4 2 2 3 4 2 3 2 2 ...
##  $ ticket      : Factor w/ 2 levels "Alpha Numeric",..: 2 2 2 2 2 2 1 2 2 2 ...
##  $ cabin       : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
##  $ title       : Factor w/ 5 levels "Master","Miss",..: 3 3 4 3 2 3 4 2 3 3 ...

#Splitting the data
data <- d

# Recode the outcome to syntactically valid labels ("0"/"1" -> "X0"/"X1")
# before splitting, so the training and validation sets share the same labels.
# Recoding TrainSet alone leaves ValidSet on "0"/"1" while predictions come
# back as "X0"/"X1".
# NOTE(review): caret is understood to require valid R names as class levels
# when classProbs = TRUE - confirm against the caret documentation.
data$Survived <- factor(make.names(data$Survived))

# 80/20 split; the fractional size is truncated by sample().
set.seed(18)
train <- sample(nrow(data), .8*nrow(data), replace = FALSE)
TrainSet <- data[train,]
ValidSet <- data[-train,]

#Tuning parameters
# NOTE(review): gbmGrid is built here but never passed to train() below (there
# is no tuneGrid argument); the printed results show shrinkage held at 0.1 and
# a 3 x 3 grid, i.e. caret's defaults. Either pass tuneGrid = gbmGrid or delete
# the grid. As written it has 5 * 51 * 100 rows and includes n.trees = 0.
gbmGrid <-  expand.grid(interaction.depth = c(1, 3, 6, 9, 10),
                        n.trees = (0:50)*50, 
                        shrinkage = seq(.0005, .05,.0005),
                        n.minobsinnode = 10)

# Repeated cross-validation, 5 repeats, with class probabilities computed.
fitControl <- trainControl(method = "repeatedcv",
                           repeats = 5,
                           classProbs = TRUE)

set.seed(862)

# Fit gbm through caret, selecting the tuning values by Kappa; system.time()
# reports how long the fit takes.
system.time(gbm.ada.1 <- caret::train(Survived ~ ., 
                                      data = TrainSet ,
                                      method = "gbm", 
                                      trControl = fitControl,
                                      verbose = FALSE,
                                      metric = "Kappa"))

##    user  system elapsed 
##    4.73    0.08    5.02

# Print the caret model fitted on the no-age data: resampling setup,
# Accuracy/Kappa for each tuning combination, and the final values chosen.
gbm.ada.1

## Stochastic Gradient Boosting 
## 
## 141 samples
##   8 predictor
##   2 classes: 'X0', 'X1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times) 
## Summary of sample sizes: 126, 126, 128, 127, 127, 127, ... 
## Resampling results across tuning parameters:
## 
##   interaction.depth  n.trees  Accuracy   Kappa    
##   1                   50      0.8209158  0.5417118
##   1                  100      0.8365495  0.5778723
##   1                  150      0.8407399  0.5873141
##   2                   50      0.8295018  0.5648355
##   2                  100      0.8183590  0.5405638
##   2                  150      0.8227399  0.5548470
##   3                   50      0.8211062  0.5425349
##   3                  100      0.8183590  0.5399446
##   3                  150      0.8127253  0.5365311
## 
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
## 
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Kappa was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150,
##  interaction.depth = 1, shrinkage = 0.1 and n.minobsinnode = 10.

# Score the hold-out set and cross-tabulate predicted (rows) against actual
# (columns) survival. The table gets its own name instead of being assigned to
# `t`, the name the passenger data frame is loaded under.
pred <- predict(gbm.ada.1, ValidSet)
conf.wo.age <- table(pred, ValidSet$Survived)
conf.wo.age

##     
## pred  0  1
##   X0 24  3
##   X1  2  7

# Titanic

# Anirban Shaw

# January 22, 2019