# Column-wise summary of the passenger data frame `t`; the recorded output
# follows as `##` lines.
summary(t)
## PassengerId Survived Pclass Title
## Min. : 1.0 Min. :0.0000 Min. :1.000 Mr :517
## 1st Qu.:223.5 1st Qu.:0.0000 1st Qu.:2.000 Miss :182
## Median :446.0 Median :0.0000 Median :3.000 Mrs :125
## Mean :446.0 Mean :0.3838 Mean :2.309 Master: 40
## 3rd Qu.:668.5 3rd Qu.:1.0000 3rd Qu.:3.000 Dr : 7
## Max. :891.0 Max. :1.0000 Max. :3.000 Rev : 6
## (Other): 14
## Sex Age SibSp Parch
## female:314 Min. : 0.42 Min. :0.000 Min. :0.0000
## male :577 1st Qu.:20.12 1st Qu.:0.000 1st Qu.:0.0000
## Median :28.00 Median :0.000 Median :0.0000
## Mean :29.70 Mean :0.523 Mean :0.3816
## 3rd Qu.:38.00 3rd Qu.:1.000 3rd Qu.:0.0000
## Max. :80.00 Max. :8.000 Max. :6.0000
## NA's :177
## Dependancies Ticket Fare Cabin
## Min. : 0.0000 1601 : 7 Min. : 0.00 :687
## 1st Qu.: 0.0000 347082 : 7 1st Qu.: 7.91 B96 B98 : 4
## Median : 0.0000 CA. 2343: 7 Median : 14.45 C23 C25 C27: 4
## Mean : 0.9046 3101295 : 6 Mean : 32.20 G6 : 4
## 3rd Qu.: 1.0000 347088 : 6 3rd Qu.: 31.00 C22 C26 : 3
## Max. :10.0000 CA 2144 : 6 Max. :512.33 D : 3
## (Other) :852 (Other) :186
## Embarked
## : 2
## C:168
## Q: 77
## S:644
##
##
##
# Constant weight of 1 per passenger; the alluvial plots use it as `y = Q`
t <- mutate(t, Q = 1)
# Inspect the column types before correcting them
str(t)
## 'data.frame': 891 obs. of 14 variables:
## $ PassengerId : int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Title : Factor w/ 17 levels " Capt"," Col",..: 12 13 9 13 12 12 12 8 13 13 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Dependancies: int 1 1 0 1 0 0 0 4 2 1 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
## $ Q : num 1 1 1 1 1 1 1 1 1 1 ...
# Survival flag and passenger class are categories, not quantities
t$Survived <- as.factor(t$Survived)
t$Pclass <- as.factor(t$Pclass)
# 2 sets, 1 with age and 1 without age.
# The split tests Age alone so the two sets always partition `t`; na.omit()
# would also discard rows that have a missing value in any other column.
t.age <- t[!is.na(t$Age), ]
t.wo.age <- t[is.na(t$Age), ]
# Remove Age column for dataset without age
colnames(t.wo.age)
## [1] "PassengerId" "Survived" "Pclass" "Title"
## [5] "Sex" "Age" "SibSp" "Parch"
## [9] "Dependancies" "Ticket" "Fare" "Cabin"
## [13] "Embarked" "Q"
# Dropped by name rather than by position so a reordered frame cannot lose
# the wrong column
t.wo.age$Age <- NULL
# Plots of data with age
# Columns are mapped by bare name so they resolve through `data`; writing
# `t.age$col` inside aes() bypasses `data` and labels axes and legends with
# that expression.
# Age breaks run from 0 to 81 in 3-year bins so passengers older than 48 are
# drawn too (summary(t) reports a maximum age of 80).
ggplot(data = t.age, aes(x = Age, fill = Survived)) +
  geom_histogram(breaks = seq(0, 81, by = 3))

ggplot(data = t.age, aes(x = Pclass, fill = Survived)) +
  geom_bar()

ggplot(data = t.age, aes(x = Sex, fill = Survived)) +
  geom_bar()

ggplot(data = t.age, aes(x = Dependancies, fill = Survived)) +
  geom_bar()

# NOTE(review): fares above 300 fall outside these breaks and are not drawn
# (summary(t) reports a maximum fare of 512.33) - confirm the zoom is intended.
ggplot(data = t.age, aes(x = Fare, fill = Survived)) +
  geom_histogram(breaks = seq(0, 300, by = 25))

ggplot(data = t.age, aes(x = Embarked, fill = Survived)) +
  geom_bar()

# Alluvial diagram: flows from Sex through Pclass to Survived, each passenger
# weighted by the constant Q column and coloured by survival.
ggplot(t.age,
       aes(y = Q,
           axis1 = Sex, axis2 = Pclass, axis3 = Survived)) +
  geom_alluvium(aes(fill = Survived)) +
  geom_stratum(width = 1/3) +
  geom_text(stat = "stratum", label.strata = TRUE) +
  scale_x_continuous(breaks = 1:3, labels = c("Gender", "Class", "Survived")) +
  ggtitle("Titanic survival by class and sex")

# Check multi-collinearity, distributions etc.
# Columns are selected by name (previously positions 2,3,5,6,7,8,9,11) so the
# plot does not silently change if the column order does.
ggpairs(t.age[, c("Survived", "Pclass", "Sex", "Age",
                  "SibSp", "Parch", "Dependancies", "Fare")])

# Remove SibSp & Parch
colnames(t.age)
## [1] "PassengerId" "Survived" "Pclass" "Title"
## [5] "Sex" "Age" "SibSp" "Parch"
## [9] "Dependancies" "Ticket" "Fare" "Cabin"
## [13] "Embarked" "Q"
t.age.nc <- t.age[, !(colnames(t.age) %in% c("SibSp", "Parch"))]
str(t.age.nc)
## 'data.frame': 714 obs. of 12 variables:
## $ PassengerId : int 1 2 3 4 5 7 8 9 10 11 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 2 2 2 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 1 3 3 2 3 ...
## $ Title : Factor w/ 17 levels " Capt"," Col",..: 12 13 9 13 12 12 8 13 13 9 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
## $ Age : num 22 38 26 35 35 54 2 27 14 4 ...
## $ Dependancies: int 1 1 0 1 0 0 4 2 1 2 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 86 396 345 133 617 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 131 1 1 1 147 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 4 4 4 2 4 ...
## $ Q : num 1 1 1 1 1 1 1 1 1 1 ...
# Feature engineering
# Ticket: how often each ticket code occurs in the with-age set
summary(t.age.nc[["Ticket"]])
## 347082 3101295 347088 CA 2144 382652
## 7 6 6 6 5
## S.O.C. 14879 113760 113781 1601 19950
## 5 4 4 4 4
## 2666 347077 349909 LINE W./C. 6608
## 4 4 4 4 4
## 110152 110413 13502 17421 230080
## 3 3 3 3 3
## 24160 29106 345773 347742 35273
## 3 3 3 3 3
## 363291 C.A. 31921 C.A. 34651 F.C.C. 13529 PC 17572
## 3 3 3 3 3
## PC 17582 PC 17755 PC 17757 PC 17760 SC/Paris 2123
## 3 3 3 3 3
## 111361 113572 113776 113789 113803
## 2 2 2 2 2
## 113806 11668 11751 11767 11967
## 2 2 2 2 2
## 12749 13507 16966 17474 19877
## 2 2 2 2 2
## 19928 19943 220845 230136 230433
## 2 2 2 2 2
## 231919 237736 239865 243847 244252
## 2 2 2 2 2
## 244367 248727 248738 250644 250647
## 2 2 2 2 2
## 250649 250655 26360 2651 2653
## 2 2 2 2 2
## 2659 2691 2699 28403 2908
## 2 2 2 2 2
## 29750 3101278 31027 345764 347054
## 2 2 2 2 2
## 347080 349237 35281 364516 364849
## 2 2 2 2 2
## 36928 36947 36973 370129 392096
## 2 2 2 2 2
## 54636 7534 A/4 48871 A/5. 3336 C.A. 2315
## 2 2 2 2 2
## C.A. 2673 C.A. 33112 C.A. 37671 P/PP 3381 (Other)
## 2 2 2 2 455
# Classify each ticket code: if nothing is left after stripping the digits the
# code is "Numeric", otherwise "Alpha Numeric". The codes are read from
# t.age.nc itself, so the new column cannot drift out of row alignment with
# the frame it is written to.
t.age.nc$ticket <- if_else(gsub("\\d+", "", t.age.nc$Ticket) == "",
                           "Numeric", "Alpha Numeric")
colnames(t.age.nc)
## [1] "PassengerId" "Survived" "Pclass" "Title"
## [5] "Sex" "Age" "Dependancies" "Ticket"
## [9] "Fare" "Cabin" "Embarked" "Q"
## [13] "ticket"
# Drop the raw ticket code by name now that the derived flag exists
t.age.nc$Ticket <- NULL
colnames(t.age.nc)
## [1] "PassengerId" "Survived" "Pclass" "Title"
## [5] "Sex" "Age" "Dependancies" "Fare"
## [9] "Cabin" "Embarked" "Q" "ticket"
t.age.nc$ticket <- as.factor(t.age.nc$ticket)
str(t.age.nc)
## 'data.frame': 714 obs. of 12 variables:
## $ PassengerId : int 1 2 3 4 5 7 8 9 10 11 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 2 2 2 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 1 3 3 2 3 ...
## $ Title : Factor w/ 17 levels " Capt"," Col",..: 12 13 9 13 12 12 8 13 13 9 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
## $ Age : num 22 38 26 35 35 54 2 27 14 4 ...
## $ Dependancies: int 1 1 0 1 0 0 4 2 1 2 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 131 1 1 1 147 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 4 4 4 2 4 ...
## $ Q : num 1 1 1 1 1 1 1 1 1 1 ...
## $ ticket : Factor w/ 2 levels "Alpha Numeric",..: 1 1 1 2 2 2 2 2 2 1 ...
# Cabin
# Two-level flag: 0 when the cabin entry is the empty string, 1 otherwise
t.age.nc$cabin <- as.factor(if_else(t.age.nc$Cabin == "", 0, 1))
colnames(t.age.nc)
## [1] "PassengerId" "Survived" "Pclass" "Title"
## [5] "Sex" "Age" "Dependancies" "Fare"
## [9] "Cabin" "Embarked" "Q" "ticket"
## [13] "cabin"
# Remove the raw Cabin column (position 9); the lower-case flag stays
t.age.nc <- t.age.nc[, -match("Cabin", colnames(t.age.nc))]
str(t.age.nc)
## 'data.frame': 714 obs. of 12 variables:
## $ PassengerId : int 1 2 3 4 5 7 8 9 10 11 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 2 2 2 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 1 3 3 2 3 ...
## $ Title : Factor w/ 17 levels " Capt"," Col",..: 12 13 9 13 12 12 8 13 13 9 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
## $ Age : num 22 38 26 35 35 54 2 27 14 4 ...
## $ Dependancies: int 1 1 0 1 0 0 4 2 1 2 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 4 4 4 2 4 ...
## $ Q : num 1 1 1 1 1 1 1 1 1 1 ...
## $ ticket : Factor w/ 2 levels "Alpha Numeric",..: 1 1 1 2 2 2 2 2 2 1 ...
## $ cabin : Factor w/ 2 levels "0","1": 1 2 1 2 1 2 1 1 1 2 ...
# Title
# Frequency of each raw title level (the level names carry a leading space)
summary(t.age.nc$Title)
## Capt Col Don Dr Jonkheer
## 1 2 1 6 1
## Lady Major Master Miss Mlle
## 1 2 36 146 2
## Mme Mr Mrs Ms Rev
## 1 398 108 1 6
## Sir the Countess
## 1 1
class(t.age.nc$Title)
## [1] "factor"
# Keep the four common titles and pool every other level into "Others";
# a missing title stays missing.
t.age.nc$title <- case_when(
  is.na(t.age.nc$Title) ~ NA_character_,
  t.age.nc$Title == " Mr" ~ "Mr",
  t.age.nc$Title == " Mrs" ~ "Mrs",
  t.age.nc$Title == " Miss" ~ "Miss",
  t.age.nc$Title == " Master" ~ "Master",
  TRUE ~ "Others"
)
t.age.nc$title <- as.factor(t.age.nc$title)
summary(t.age.nc$title)
## Master Miss Mr Mrs Others
## 36 146 398 108 26
colnames(t.age.nc)
## [1] "PassengerId" "Survived" "Pclass" "Title"
## [5] "Sex" "Age" "Dependancies" "Fare"
## [9] "Embarked" "Q" "ticket" "cabin"
## [13] "title"
# Remove the raw Title column (position 4); the pooled `title` stays
t.age.nc <- t.age.nc[, -match("Title", colnames(t.age.nc))]
str(t.age.nc)
## 'data.frame': 714 obs. of 12 variables:
## $ PassengerId : int 1 2 3 4 5 7 8 9 10 11 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 2 2 2 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 1 3 3 2 3 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
## $ Age : num 22 38 26 35 35 54 2 27 14 4 ...
## $ Dependancies: int 1 1 0 1 0 0 4 2 1 2 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 4 4 4 2 4 ...
## $ Q : num 1 1 1 1 1 1 1 1 1 1 ...
## $ ticket : Factor w/ 2 levels "Alpha Numeric",..: 1 1 1 2 2 2 2 2 2 1 ...
## $ cabin : Factor w/ 2 levels "0","1": 1 2 1 2 1 2 1 1 1 2 ...
## $ title : Factor w/ 5 levels "Master","Miss",..: 3 4 2 4 3 3 1 4 4 2 ...
# Data for modelling
# Columns 1 and 9 are PassengerId and the constant weight Q; neither is used
# as a predictor.
d <- t.age.nc[, -c(1, 9)]
str(d)
## 'data.frame': 714 obs. of 10 variables:
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 2 1 1 1 2 2 2 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 1 3 1 3 1 3 3 2 3 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 1 1 1 ...
## $ Age : num 22 38 26 35 35 54 2 27 14 4 ...
## $ Dependancies: int 1 1 0 1 0 0 4 2 1 2 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 4 4 4 2 4 ...
## $ ticket : Factor w/ 2 levels "Alpha Numeric",..: 1 1 1 2 2 2 2 2 2 1 ...
## $ cabin : Factor w/ 2 levels "0","1": 1 2 1 2 1 2 1 1 1 2 ...
## $ title : Factor w/ 5 levels "Master","Miss",..: 3 4 2 4 3 3 1 4 4 2 ...
# Splitting the data: seeded 80/20 split into training and validation rows
data <- d
set.seed(18)
train <- sample(x = nrow(data), size = .8 * nrow(data), replace = FALSE)
TrainSet <- data[train, ]
ValidSet <- data[-train, ]
# Tuning parameters
# NOTE(review): gbmGrid is built here but never handed to caret::train() below
# (the call has no tuneGrid argument), so the fit uses caret's default grid, as
# the recorded results show. The grid also starts n.trees at 0.
gbmGrid <- expand.grid(
  interaction.depth = c(1, 3, 6, 9, 10),
  n.trees = (0:50) * 50,
  shrinkage = seq(.0005, .05, by = .0005),
  n.minobsinnode = 10
)
# 5 repeats of cross-validation, with class probabilities requested
fitControl <- trainControl(
  method = "repeatedcv",
  repeats = 5,
  classProbs = TRUE
)
# Relabel the outcome as X0 / X1 for training.
# NOTE(review): ValidSet$Survived keeps its original "0"/"1" labels.
TrainSet$Survived <- make.names(TrainSet$Survived)
set.seed(142)
system.time(
  gbm.ada.1 <- caret::train(
    Survived ~ .,
    data = TrainSet,
    method = "gbm",
    trControl = fitControl,
    verbose = FALSE,
    metric = "Kappa"
  )
)
## user system elapsed
## 13.14 0.13 14.86
gbm.ada.1
## Stochastic Gradient Boosting
##
## 571 samples
## 9 predictor
## 2 classes: 'X0', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 514, 514, 514, 514, 515, 513, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.8113221 0.6020723
## 1 100 0.8113216 0.6034435
## 1 150 0.8095731 0.5990527
## 2 50 0.8148550 0.6066429
## 2 100 0.8134818 0.6036922
## 2 150 0.8149350 0.6075714
## 3 50 0.8103427 0.5964966
## 3 100 0.8106998 0.5982891
## 3 150 0.8034036 0.5824405
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Kappa was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150,
## interaction.depth = 2, shrinkage = 0.1 and n.minobsinnode = 10.
# Predict the validation rows and cross-tabulate predicted (rows) against
# observed (columns) survival. The table gets its own name: assigning it to
# `t` would overwrite the passenger data frame and mask base::t().
pred <- predict(gbm.ada.1, ValidSet)
conf.tab <- table(pred, ValidSet$Survived)
conf.tab
##
## pred 0 1
## X0 72 15
## X1 9 47
# Plots of data WITHOUT age
# Columns are mapped by bare name so they resolve through `data`; writing
# `t.wo.age$col` inside aes() bypasses `data` and labels axes and legends with
# that expression.
ggplot(data = t.wo.age, aes(x = Pclass, fill = Survived)) +
  geom_bar()

ggplot(data = t.wo.age, aes(x = Sex, fill = Survived)) +
  geom_bar()

ggplot(data = t.wo.age, aes(x = Dependancies, fill = Survived)) +
  geom_bar()

# NOTE(review): fares above 300 would fall outside these breaks and not be
# drawn - confirm the zoom is intended.
ggplot(data = t.wo.age, aes(x = Fare, fill = Survived)) +
  geom_histogram(breaks = seq(0, 300, by = 25))

ggplot(data = t.wo.age, aes(x = Embarked, fill = Survived)) +
  geom_bar()

# Alluvial diagram: flows from Sex through Pclass to Survived, each passenger
# weighted by the constant Q column and coloured by survival.
ggplot(t.wo.age,
       aes(y = Q,
           axis1 = Sex, axis2 = Pclass, axis3 = Survived)) +
  geom_alluvium(aes(fill = Survived)) +
  geom_stratum(width = 1/3) +
  geom_text(stat = "stratum", label.strata = TRUE) +
  scale_x_continuous(breaks = 1:3, labels = c("Gender", "Class", "Survived")) +
  ggtitle("Titanic survival by class and sex")

# Check multi-collinearity, distributions etc.
str(t.wo.age)
## 'data.frame': 177 obs. of 13 variables:
## $ PassengerId : int 6 18 20 27 29 30 32 33 37 43 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 2 2 2 1 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 2 3 3 3 3 1 3 3 3 ...
## $ Title : Factor w/ 17 levels " Capt"," Col",..: 12 12 13 12 9 12 13 9 12 12 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 1 1 2 2 ...
## $ SibSp : int 0 0 0 0 0 0 1 0 0 0 ...
## $ Parch : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Dependancies: int 0 0 0 0 0 0 1 0 0 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 276 152 185 180 284 363 587 289 203 392 ...
## $ Fare : num 8.46 13 7.22 7.22 7.88 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 1 1 1 1 1 43 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 3 4 2 2 3 4 2 3 2 2 ...
## $ Q : num 1 1 1 1 1 1 1 1 1 1 ...
# Pairwise plots of the candidate predictors. Columns are selected by name
# (previously positions 2,3,5,6,7,8,10,12) so the plot does not silently
# change if the column order does.
ggpairs(t.wo.age[, c("Survived", "Pclass", "Sex", "SibSp",
                     "Parch", "Dependancies", "Fare", "Embarked")])

# Remove SibSp & Parch
colnames(t.wo.age)
## [1] "PassengerId" "Survived" "Pclass" "Title"
## [5] "Sex" "SibSp" "Parch" "Dependancies"
## [9] "Ticket" "Fare" "Cabin" "Embarked"
## [13] "Q"
t.wo.age.nc <- t.wo.age[, !(colnames(t.wo.age) %in% c("SibSp", "Parch"))]
str(t.wo.age.nc)
## 'data.frame': 177 obs. of 11 variables:
## $ PassengerId : int 6 18 20 27 29 30 32 33 37 43 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 2 2 2 1 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 2 3 3 3 3 1 3 3 3 ...
## $ Title : Factor w/ 17 levels " Capt"," Col",..: 12 12 13 12 9 12 13 9 12 12 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 1 1 2 2 ...
## $ Dependancies: int 0 0 0 0 0 0 1 0 0 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 276 152 185 180 284 363 587 289 203 392 ...
## $ Fare : num 8.46 13 7.22 7.22 7.88 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 1 1 1 1 1 43 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 3 4 2 2 3 4 2 3 2 2 ...
## $ Q : num 1 1 1 1 1 1 1 1 1 1 ...
# Feature engineering
# Ticket: how often each ticket code occurs in the without-age set
summary(t.wo.age.nc[["Ticket"]])
## CA. 2343 4133 1601 239853 371110 2661
## 7 4 3 3 3 2
## 2668 367226 367230 370365 376564 W./C. 6607
## 2 2 2 2 2 2
## 110465 111427 112052 112058 112379 113028
## 1 1 1 1 1 1
## 113056 113505 113510 113767 113796 113798
## 1 1 1 1 1 1
## 11774 12460 14311 14312 14313 16988
## 1 1 1 1 1 1
## 17421 17453 17464 19947 19988 19996
## 1 1 1 1 1 1
## 226593 239854 239855 239856 244373 248727
## 1 1 1 1 1 1
## 2624 2626 2627 2629 2631 2641
## 1 1 1 1 1 1
## 2647 2649 2662 2664 2665 2671
## 1 1 1 1 1 1
## 2674 2677 2678 2686 2689 2700
## 1 1 1 1 1 1
## 312991 312993 315037 323592 330877 330909
## 1 1 1 1 1 1
## 330919 330931 330932 330935 330959 330979
## 1 1 1 1 1 1
## 330980 334912 335677 3411 343095 345777
## 1 1 1 1 1 1
## 349201 349208 349214 349215 349216 349217
## 1 1 1 1 1 1
## 349218 349221 349222 349223 349225 349227
## 1 1 1 1 1 1
## 349234 349253 349254 35852 358585 36209
## 1 1 1 1 1 1
## 362316 364498 364848 (Other)
## 1 1 1 56
# Classify each ticket code: if nothing is left after stripping the digits the
# code is "Numeric", otherwise "Alpha Numeric". The codes are read from
# t.wo.age.nc itself, so the new column cannot drift out of row alignment with
# the frame it is written to.
t.wo.age.nc$ticket <- if_else(gsub("\\d+", "", t.wo.age.nc$Ticket) == "",
                              "Numeric", "Alpha Numeric")
colnames(t.wo.age.nc)
## [1] "PassengerId" "Survived" "Pclass" "Title"
## [5] "Sex" "Dependancies" "Ticket" "Fare"
## [9] "Cabin" "Embarked" "Q" "ticket"
# Drop the raw ticket code by name now that the derived flag exists
t.wo.age.nc$Ticket <- NULL
colnames(t.wo.age.nc)
## [1] "PassengerId" "Survived" "Pclass" "Title"
## [5] "Sex" "Dependancies" "Fare" "Cabin"
## [9] "Embarked" "Q" "ticket"
t.wo.age.nc$ticket <- as.factor(t.wo.age.nc$ticket)
str(t.wo.age.nc)
## 'data.frame': 177 obs. of 11 variables:
## $ PassengerId : int 6 18 20 27 29 30 32 33 37 43 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 2 2 2 1 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 2 3 3 3 3 1 3 3 3 ...
## $ Title : Factor w/ 17 levels " Capt"," Col",..: 12 12 13 12 9 12 13 9 12 12 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 1 1 2 2 ...
## $ Dependancies: int 0 0 0 0 0 0 1 0 0 0 ...
## $ Fare : num 8.46 13 7.22 7.22 7.88 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 1 1 1 1 1 43 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 3 4 2 2 3 4 2 3 2 2 ...
## $ Q : num 1 1 1 1 1 1 1 1 1 1 ...
## $ ticket : Factor w/ 2 levels "Alpha Numeric",..: 2 2 2 2 2 2 1 2 2 2 ...
# Cabin
# Two-level flag: 0 when the cabin entry is the empty string, 1 otherwise
t.wo.age.nc$cabin <- as.factor(if_else(t.wo.age.nc$Cabin == "", 0, 1))
colnames(t.wo.age.nc)
## [1] "PassengerId" "Survived" "Pclass" "Title"
## [5] "Sex" "Dependancies" "Fare" "Cabin"
## [9] "Embarked" "Q" "ticket" "cabin"
# Remove the raw Cabin column (position 8); the lower-case flag stays
t.wo.age.nc <- t.wo.age.nc[, -match("Cabin", colnames(t.wo.age.nc))]
str(t.wo.age.nc)
## 'data.frame': 177 obs. of 11 variables:
## $ PassengerId : int 6 18 20 27 29 30 32 33 37 43 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 2 2 2 1 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 2 3 3 3 3 1 3 3 3 ...
## $ Title : Factor w/ 17 levels " Capt"," Col",..: 12 12 13 12 9 12 13 9 12 12 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 1 1 2 2 ...
## $ Dependancies: int 0 0 0 0 0 0 1 0 0 0 ...
## $ Fare : num 8.46 13 7.22 7.22 7.88 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 3 4 2 2 3 4 2 3 2 2 ...
## $ Q : num 1 1 1 1 1 1 1 1 1 1 ...
## $ ticket : Factor w/ 2 levels "Alpha Numeric",..: 2 2 2 2 2 2 1 2 2 2 ...
## $ cabin : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
# Title
# Frequency of each raw title level (the level names carry a leading space)
summary(t.wo.age.nc$Title)
## Capt Col Don Dr Jonkheer
## 0 0 0 1 0
## Lady Major Master Miss Mlle
## 0 0 4 36 0
## Mme Mr Mrs Ms Rev
## 0 119 17 0 0
## Sir the Countess
## 0 0
class(t.wo.age.nc$Title)
## [1] "factor"
# Keep the four common titles and pool every other level into "Others";
# a missing title stays missing.
t.wo.age.nc$title <- case_when(
  is.na(t.wo.age.nc$Title) ~ NA_character_,
  t.wo.age.nc$Title == " Mr" ~ "Mr",
  t.wo.age.nc$Title == " Mrs" ~ "Mrs",
  t.wo.age.nc$Title == " Miss" ~ "Miss",
  t.wo.age.nc$Title == " Master" ~ "Master",
  TRUE ~ "Others"
)
t.wo.age.nc$title <- as.factor(t.wo.age.nc$title)
summary(t.wo.age.nc$title)
## Master Miss Mr Mrs Others
## 4 36 119 17 1
colnames(t.wo.age.nc)
## [1] "PassengerId" "Survived" "Pclass" "Title"
## [5] "Sex" "Dependancies" "Fare" "Embarked"
## [9] "Q" "ticket" "cabin" "title"
# Remove the raw Title column (position 4); the pooled `title` stays
t.wo.age.nc <- t.wo.age.nc[, -match("Title", colnames(t.wo.age.nc))]
str(t.wo.age.nc)
## 'data.frame': 177 obs. of 11 variables:
## $ PassengerId : int 6 18 20 27 29 30 32 33 37 43 ...
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 2 2 2 1 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 2 3 3 3 3 1 3 3 3 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 1 1 2 2 ...
## $ Dependancies: int 0 0 0 0 0 0 1 0 0 0 ...
## $ Fare : num 8.46 13 7.22 7.22 7.88 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 3 4 2 2 3 4 2 3 2 2 ...
## $ Q : num 1 1 1 1 1 1 1 1 1 1 ...
## $ ticket : Factor w/ 2 levels "Alpha Numeric",..: 2 2 2 2 2 2 1 2 2 2 ...
## $ cabin : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
## $ title : Factor w/ 5 levels "Master","Miss",..: 3 3 4 3 2 3 4 2 3 3 ...
# Data for modelling
# Columns 1 and 8 are PassengerId and the constant weight Q; neither is used
# as a predictor.
d <- t.wo.age.nc[, -c(1, 8)]
str(d)
## 'data.frame': 177 obs. of 9 variables:
## $ Survived : Factor w/ 2 levels "0","1": 1 2 2 1 2 1 2 2 2 1 ...
## $ Pclass : Factor w/ 3 levels "1","2","3": 3 2 3 3 3 3 1 3 3 3 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 2 1 2 1 2 1 1 2 2 ...
## $ Dependancies: int 0 0 0 0 0 0 1 0 0 0 ...
## $ Fare : num 8.46 13 7.22 7.22 7.88 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 3 4 2 2 3 4 2 3 2 2 ...
## $ ticket : Factor w/ 2 levels "Alpha Numeric",..: 2 2 2 2 2 2 1 2 2 2 ...
## $ cabin : Factor w/ 2 levels "0","1": 1 1 1 1 1 1 2 1 1 1 ...
## $ title : Factor w/ 5 levels "Master","Miss",..: 3 3 4 3 2 3 4 2 3 3 ...
# Splitting the data: seeded 80/20 split into training and validation rows
data <- d
set.seed(18)
train <- sample(x = nrow(data), size = .8 * nrow(data), replace = FALSE)
TrainSet <- data[train, ]
ValidSet <- data[-train, ]
# Tuning parameters
# NOTE(review): gbmGrid is built here but never handed to caret::train() below
# (the call has no tuneGrid argument), so the fit uses caret's default grid, as
# the recorded results show. The grid also starts n.trees at 0.
gbmGrid <- expand.grid(
  interaction.depth = c(1, 3, 6, 9, 10),
  n.trees = (0:50) * 50,
  shrinkage = seq(.0005, .05, by = .0005),
  n.minobsinnode = 10
)
# 5 repeats of cross-validation, with class probabilities requested
fitControl <- trainControl(
  method = "repeatedcv",
  repeats = 5,
  classProbs = TRUE
)
# Relabel the outcome as X0 / X1 for training.
# NOTE(review): ValidSet$Survived keeps its original "0"/"1" labels.
TrainSet$Survived <- make.names(TrainSet$Survived)
set.seed(862)
system.time(
  gbm.ada.1 <- caret::train(
    Survived ~ .,
    data = TrainSet,
    method = "gbm",
    trControl = fitControl,
    verbose = FALSE,
    metric = "Kappa"
  )
)
## user system elapsed
## 4.73 0.08 5.02
gbm.ada.1
## Stochastic Gradient Boosting
##
## 141 samples
## 8 predictor
## 2 classes: 'X0', 'X1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 5 times)
## Summary of sample sizes: 126, 126, 128, 127, 127, 127, ...
## Resampling results across tuning parameters:
##
## interaction.depth n.trees Accuracy Kappa
## 1 50 0.8209158 0.5417118
## 1 100 0.8365495 0.5778723
## 1 150 0.8407399 0.5873141
## 2 50 0.8295018 0.5648355
## 2 100 0.8183590 0.5405638
## 2 150 0.8227399 0.5548470
## 3 50 0.8211062 0.5425349
## 3 100 0.8183590 0.5399446
## 3 150 0.8127253 0.5365311
##
## Tuning parameter 'shrinkage' was held constant at a value of 0.1
##
## Tuning parameter 'n.minobsinnode' was held constant at a value of 10
## Kappa was used to select the optimal model using the largest value.
## The final values used for the model were n.trees = 150,
## interaction.depth = 1, shrinkage = 0.1 and n.minobsinnode = 10.
# Predict the validation rows and cross-tabulate predicted (rows) against
# observed (columns) survival. The table gets its own name: assigning it to
# `t` would overwrite the passenger data frame and mask base::t().
pred <- predict(gbm.ada.1, ValidSet)
conf.tab <- table(pred, ValidSet$Survived)
conf.tab
##
## pred 0 1
## X0 24 3
## X1 2 7