Titanic: Machine Learning from Disaster

R markdown for providing my approach to the Titanic Competition in Kaggle This kernel includes data loading cleaning and some exploratory work it also contains basic modeling

Exploratory Data

Normalize function Helper

A simple normalize function, which normalizes continuous data to values between 0 and 1 some machine learning models work better with normalize data

##########################
#Function normalize      #
#Input  X  vector        #
#Return Normlize vector  #
#                        #
#                        #
#########################



normalize <- function(x) {
    return ((x - min(x,na.rm =TRUE)) / (max(x,na.rm =TRUE) - min(x,na.rm =TRUE)))
  }

Loading the train and test sets

setwd("E:/Elements/kaggle/Kaggle_Titanic")
#setwd("C:/Users/cbs/Documents/R/Kaggle.Titanic")
train<-read.csv("train.csv")
test<-read.csv("test.csv")
#Create Raw train, test and combined data
#Combined data will be used to do common preprocessing  (test and train)
#Though we will be careful not to allow a leak of a data 
#from train to test data 
Rawtrain<-train
Rawtest<-test
# To combine the train and test - 
#create dummy Survived column (and then remove it)
Rawtest['Survived']<-0
RawData<-rbind(train,Rawtest)
Rawtest<-subset(Rawtest,select = -Survived)

First, Let’s review the data.

#Create a table of the first ten observations
kable(head(Rawtrain)) %>%
  kable_styling(c("striped", "bordered")) %>%
  add_indent(c(1, 3, 5))
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.2500 S
2 1 1 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0 PC 17599 71.2833 C85 C
3 1 3 Heikkinen, Miss. Laina female 26 0 0 STON/O2. 3101282 7.9250 S
4 1 1 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0 113803 53.1000 C123 S
5 0 3 Allen, Mr. William Henry male 35 0 0 373450 8.0500 S
6 0 3 Moran, Mr. James male NA 0 0 330877 8.4583 Q

The label data, what we need to predict is the Survival column 0 - No 1- Yes This set is a classification set.

One of the first steps is to take a look if the labels are not skew and we have a balance classification.

# Using the ggplot Package                         
# Plotting a Bar plot of the Survivles Vs. Death   
p<-ggplot(data=Rawtrain,aes(x=as.factor(Survived)))+ 
  geom_bar(aes(y = (..count..)) , fill = c('red','green'))
p<-p+xlab("Survived: 0 - No , 1 - Yes") + ylab("Count")
p<-p+labs(title="Survives Versus No Survivors counts ")
p<-p+geom_text(aes(y = (..count..),label =   ifelse((..count..)==0,"",
  scales::percent((..count..)/sum(..count..)))), stat="count",colour="black") 
p

There is some skew as the number of Survival is smaller (ratio of 60%:40%) It is not a large skew if we use cross-validation techniques it probably not a big issue

Missing Data

Let’s review how much data we are missing. Note: Some of the string variables have empty data and not null so first, we replace Empty data by NA

#Some of the missing data is NA, and some (mainly strings) is empty data 
#convert empty strings to NA 
Rawtrain<-Rawtrain %>% 
       mutate(Cabin = if_else(as.character(Cabin)=="", NA_character_ ,as.character(Cabin)))

Rawtrain<-Rawtrain %>% 
       mutate(Ticket = if_else(as.character(Ticket)=="", NA_character_ ,as.character(Ticket)))

Rawtrain<-Rawtrain %>% 
       mutate(Embarked = if_else(as.character(Embarked)=="", NA_character_ ,as.character(Embarked)))

Rawtest<-Rawtest %>% 
       mutate(Cabin = if_else(as.character(Cabin)=="", NA_character_ ,as.character(Cabin)))

Rawtest<-Rawtest %>% 
       mutate(Ticket = if_else(as.character(Ticket)=="", NA_character_ ,as.character(Ticket)))

Rawtest<-Rawtest %>% 
       mutate(Embarked = if_else(as.character(Embarked)=="", NA_character_ ,as.character(Embarked)))
#Using the Inspect package show all missing items  
inspect.na(Rawtrain, barplot=TRUE)

##   column_index column_name number_of_NAs ratio_of_NA
## 1            6         Age           177 0.198653199
## 2           11       Cabin           687 0.771043771
## 3           12    Embarked             2 0.002244669
inspect.na(Rawtest, barplot=TRUE)

##   column_index column_name number_of_NAs ratio_of_NA
## 1            5         Age            86 0.205741627
## 2            9        Fare             1 0.002392344
## 3           10       Cabin           327 0.782296651

Age and Cabin has a relatively high amount of missing data.

Pclass

Pclass will be used as a Factor or Categorical variable

#plot Pcalss Versus Death  Survived 
theme_update(plot.title = element_text(hjust = 0.5))
p<-ggplot(Rawtrain, aes((y =..count..),x=Pclass)) + 
  geom_bar(aes(fill = as.factor(Survived)), position = "dodge")
p<-p+xlab("Pclass") + ylab("Count")
p<-p+scale_fill_manual(name="Survived/Death  ",values=c("red", "green"),
                       labels=c("Death", "Survived"))
p<-p+labs(title=" Bar Plot Pclass -- Survived ")
p

There are three class types - 1, 2 and 3. It Seems that in Class 3 you have less chance to survive While in Class 1 you had a higher chance to survive In Class 2 the odd are Equal Convert the Pclass to Factor variable

#Convert to a factor
Rawtest$Pclass<-as.factor(Rawtest$Pclass)
Rawtrain$Pclass<-as.factor(Rawtrain$Pclass)

Sex

Plot Sex Versus Survived Death

#Plot Sex Versus Survived Death 
p<-ggplot(data=Rawtrain,aes(x=as.factor(Survived),
                            fill=Sex))+geom_bar()
p<-p+scale_fill_manual("legend", values =
                         c("male" = "blue", "female" = "red"))
p<-p+xlab("Survived: 0 - No, 1 - Yes") + ylab("Count")
p<-p+labs(title="Survives Vs. Death Based on Sex ")
p<-p+geom_text(aes(label=..count..),
               stat="count",position=position_stack(0.5))
p

Table of Sex Counts

kable(table(Rawtrain$Sex),col.names = c("Sex","Frequency")) %>%
  kable_styling("striped", full_width = F) 
Sex Frequency
female 314
male 577

Checking the Correlation To check Correlation, we create a Categorical variable 1 - Female 0 - Sex

#Using GGpairs draw correlation graphs
Rawtrain<-mutate(Rawtrain,Sexb=ifelse(Sex=="female",1,0))
Rawtest<-mutate(Rawtest,Sexb=ifelse(Sex=="female",1,0))
ggpairs(subset(Rawtrain,select=c("Survived", "Sexb")))

Conclusion : 1. No Missing data or NA 2. There is some correlation between Sex and Survival 2. Sex looks like a good predictor/Feature to use

Age

Plot Age Versus Survived Death

#Plot Age Versus Survived Death 
p<-ggplot(Rawtrain,aes(x=Age,group=as.factor(Survived),
                       fill=as.factor(Survived)))+
  geom_density(alpha = 0.5)
p<-p+scale_fill_manual("legend",labels = c("Dead", "Survived"),
                       values = c("red","green"))

p
## Warning: Removed 177 rows containing non-finite values (stat_density).

Later we will use the Data from the Sex and names to complete the missing data. Look for outliers and another way to look at the relation between Age and survivals.

#Boxplot Age - Versus Survived or Death
p<-ggplot(Rawtrain, aes(x=as.factor(Survived),y=Age,
                        color = as.factor(Survived))) +   geom_boxplot()
p<-p+scale_color_manual(values=c("red", "green"))
p<-p+xlab("Survived: 0 - No , 1 - Yes") + ylab("Age - Years")
p<-p+labs(title=" Boxplot Of Survivals/Death -- Age")
p
## Warning: Removed 177 rows containing non-finite values (stat_boxplot).

We can see that Age as a continuous feature/Predictor there are points it is a good predictor and some that it doesn’t

(Passenger) Name

In this part, we are doing some simple text analytics Let’s try to understand what type of word the naming feature can provide The step we will take care 1.Build a corpus 2. Tokenize (converting to tokens, words in this case ) Removing numbers, symbols separators, punctuation and some another standard cleaning (though it is probably less needed here ) 3.Create Uni Grahm 4.Sort according to count (Frequency )

#Create Corpus
corp <- corpus(as.character(RawData$Name))
#Tokenize
Text.Sentences <- tokens(
  x = tolower(corp),
  remove_punct = TRUE,
  remove_twitter = TRUE,
  remove_numbers = TRUE,
  remove_hyphens = TRUE,
  remove_symbols = TRUE,
  remove_separators = TRUE,
  remove_url = TRUE)

##############################################
# Create Uni Gram                            #
#                                            #
##############################################
uni_DFM <- dfm(Text.Sentences)
#Trim to Words with Priority higher than 2

#Calculate the Col Sum 
sums_U <- colSums(uni_DFM)

#PAck in Data table 
uni_words <- data.table(word_1 = names(sums_U), count = sums_U)

#Sort 
setorder(uni_words,-count)
#Print Table
kable(head(uni_words,15)) %>%
  kable_styling("striped", full_width = F) %>%
  add_indent(c(1, 3, 5))
word_1 count
mr 763
miss 260
mrs 201
william 87
john 72
master 61
henry 49
charles 40
james 38
george 37
thomas 36
joseph 31
mary 30
edward 28
elizabeth 25

The most interesting results are Mr, Miss, Mrs and Master (Master in the old days is a boy under 18) We can use this information for two purpose

  1. There are 177 passengers that their age is missing, we can use this data to have a better technique to complete missing data (using mean according to the titles)
  2. We can use the information to create a feature married or not a married woman

Based on the results build a new column Mr, Miss, Mrs or Master

#Use the Tokens Vector to build a title column
Text<-sapply(Text.Sentences,as.character)
regex <- 
chkmr<-grepl(paste0(sprintf("(?=.*%s)", "mr"), collapse = ''), Text, perl = TRUE)
chkmiss<-grepl(paste0(sprintf("(?=.*%s)", "miss"), collapse = ''), Text, perl = TRUE)
chkmrs<-grepl(paste0(sprintf("(?=.*%s)", "mrs"), collapse = ''), Text, perl = TRUE)
chkmaster<-grepl(paste0(sprintf("(?=.*%s)", "master"), collapse = ''), Text, perl = TRUE)

RawData<-RawData%>%mutate(Title=ifelse(chkmrs,"Mrs",ifelse(chkmiss,"Miss",ifelse(chkmr,"Mr.",
                                                                                  ifelse(chkmaster,"Master","None")))))

Let’s look at a summary after populating the title.

kable(table(RawData$Title),col.names = c("Title","Frequency")) %>%
  kable_styling("striped", full_width = F) 
Title Frequency
Master 61
Miss 258
Mr. 758
Mrs 201
None 31

Let’s look on the None title items

kable(filter(RawData,Title =="None"))%>%
  kable_styling(c("striped", "bordered"))
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked Title
31 0 1 Uruchurtu, Don. Manuel E male 40 0 0 PC 17601 27.7208 C None
150 0 2 Byles, Rev. Thomas Roussel Davids male 42 0 0 244310 13.0000 S None
151 0 2 Bateman, Rev. Robert James male 51 0 0 S.O.P. 1166 12.5250 S None
246 0 1 Minahan, Dr. William Edward male 44 2 0 19928 90.0000 C78 Q None
250 0 2 Carter, Rev. Ernest Courtenay male 54 1 0 244252 26.0000 S None
318 0 2 Moraweck, Dr. Ernest male 54 0 0 29011 14.0000 S None
370 1 1 Aubart, Mme. Leontine Pauline female 24 0 0 PC 17477 69.3000 B35 C None
399 0 2 Pain, Dr. Alfred male 23 0 0 244278 10.5000 S None
444 1 2 Reynaldo, Ms. Encarnacion female 28 0 0 230434 13.0000 S None
450 1 1 Peuchen, Major. Arthur Godfrey male 52 0 0 113786 30.5000 C104 S None
537 0 1 Butt, Major. Archibald Willingham male 45 0 0 113050 26.5500 B38 S None
627 0 2 Kirkland, Rev. Charles Leonard male 57 0 0 219533 12.3500 Q None
633 1 1 Stahelin-Maeglin, Dr. Max male 32 0 0 13214 30.5000 B50 C None
642 1 1 Sagesser, Mlle. Emma female 24 0 0 PC 17477 69.3000 B35 C None
648 1 1 Simonius-Blumer, Col. Oberst Alfons male 56 0 0 13213 35.5000 A26 C None
661 1 1 Frauenthal, Dr. Henry William male 50 2 0 PC 17611 133.6500 S None
695 0 1 Weir, Col. John male 60 0 0 113800 26.5500 S None
746 0 1 Crosby, Capt. Edward Gifford male 70 1 1 WE/P 5735 71.0000 B22 S None
760 1 1 Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards) female 33 0 0 110152 86.5000 B77 S None
767 0 1 Brewe, Dr. Arthur Jackson male NA 0 0 112379 39.6000 C None
797 1 1 Leader, Dr. Alice (Farnham) female 49 0 0 17465 25.9292 D17 S None
823 0 1 Reuchlin, Jonkheer. John George male 38 0 0 19972 0.0000 S None
849 0 2 Harper, Rev. John male 28 0 1 248727 33.0000 S None
887 0 2 Montvila, Rev. Juozas male 27 0 0 211536 13.0000 S None
980 0 3 O’Donoghue, Ms. Bridget female NA 0 0 364856 7.7500 Q None
1023 0 1 Gracie, Col. Archibald IV male 53 0 0 113780 28.5000 C51 C None
1041 0 2 Lahtinen, Rev. William male 30 1 1 250651 26.0000 S None
1056 0 2 Peruschitz, Rev. Joseph Maria male 41 0 0 237393 13.0000 S None
1094 0 1 Astor, Col. John Jacob male 47 1 0 PC 17757 227.5250 C62 C64 C None
1185 0 1 Dodge, Dr. Washington male 53 1 1 33638 81.8583 A34 S None
1306 0 1 Oliva y Ocana, Dona. Fermina female 39 0 0 PC 17758 108.9000 C105 C None

It looks like we should add the following 1. Dr. - For Doctors 2. Rev - 3. Army - Majo. Col. Capt. 4. Dona and Don will be treated as Mrs and Mr.

chkdr<-grepl(paste0(sprintf("(?=.*%s)", "dr"), collapse = ''), Text, perl = TRUE)
chkrev<-grepl(paste0(sprintf("(?=.*%s)", "rev"), collapse = ''), Text, perl = TRUE)
chkArmy1<-grepl(paste0(sprintf("(?=.*%s)", "major"), collapse = ''), Text, perl = TRUE)
chkArmy2<-grepl(paste0(sprintf("(?=.*%s)", "col"), collapse = ''), Text, perl = TRUE)
chkArmy3<-grepl(paste0(sprintf("(?=.*%s)", "capt"), collapse = ''), Text, perl = TRUE)

RawData<-RawData%>%
  mutate(Title2=ifelse(chkdr ,"Dr.",ifelse(chkrev,"Rev.",ifelse(chkArmy1,"Army",ifelse(chkArmy2,"Army",                                                                                   ifelse(chkArmy3,"Army","None")))))) 

chkdon<-grepl(paste0(sprintf("(?=.*%s)", "don"), collapse = ''), Text, perl = TRUE)
chkdone<-grepl(paste0(sprintf("(?=.*%s)", "dona"), collapse = ''), Text, perl = TRUE)

RawData<-RawData%>%
  mutate(Title3=ifelse(chkdon ,"Mr.",ifelse(chkdone,"Mrs" ,"None")))                                              

Merging the three titles

RawData$Title[RawData$Title == "None"] <- RawData$Title2[RawData$Title == "None"]
RawData$Title[RawData$Title == "None"] <- RawData$Title3[RawData$Title == "None"]
RawData <- subset(RawData, select = -c(Title2,Title3))
Rawtrain[,"Title"]<-RawData[1:nrow(Rawtrain),"Title"]

test.index<-(nrow(Rawtrain)+1):(nrow(Rawtrain)+nrow(Rawtest))
Rawtest[,"Title"]<-RawData[test.index,]$Title

Plot a Survived Death boxplot based on the titles

#Plot a Survived Death boxplot based on the titles 
p<-ggplot(Rawtrain, aes((y =..count..),x=Title)) +
  geom_bar(aes(fill = as.factor(Survived)), position = "dodge")
p<-p+xlab("Title") + ylab("Count")
p<-p+scale_fill_manual(name="Survived/Death  ",
                       values=c("red", "green"), labels=c("Death", "Survived"))
p<-p+labs(title=" Bar Plot Title -- Survived ")
p

Handling Title missing data using Age And Sex

First, let’s see with current data what is the data about Age per Title.

p<-ggplot(Rawtrain, aes(x=as.factor(Title),y=Age,color = as.factor(Title))) +   geom_boxplot()
p<-p+xlab("Title") + ylab("Age - Years")
p<-p+labs(title=" Boxplot Of Title -- Age ")
p
## Warning: Removed 177 rows containing non-finite values (stat_boxplot).

Calculate Mean Median and SD for Age per Title ### Train

TitlesOptions<-c("Mr.","Miss","Mrs","Master","Dr.","Army","Rev.")
for (i in 1:length(TitlesOptions))
{
  df<-subset(Rawtrain,(Title==TitlesOptions[i]))
  Mean<-mean(df$Age,na.rm=TRUE) 
  Median<-median(df$Age,na.rm=TRUE)
  Sdv<-sd(df$Age,na.rm=TRUE)
  
  n = dim((subset(Rawtrain,Title==TitlesOptions[i] & !is.na(Age))))[1]
  Esdv <- sqrt(Sdv^2/n)
  
  if (i==1)
    DFAgeStat<-data.frame("Title" = TitlesOptions[i],"Mean" = Mean , "Median" = Median , "Sdv" = Sdv , "Esdv" = Esdv  )
  else
  {
    DF<-data.frame("Title" = TitlesOptions[i],"Mean" = Mean , "Median" = Median , "Sdv" = Sdv , "Esdv" = Esdv )
    DFAgeStat<-rbind(DFAgeStat,DF)
  }

}

#Show the statisticl results
kable((DFAgeStat)) %>%
  kable_styling("striped", full_width = F) 
Title Mean Median Sdv Esdv
Mr. 32.428750 30.0 12.709815 0.6354907
Miss 21.777778 21.0 13.077436 1.0897863
Mrs 35.642857 35.0 11.506251 1.0872386
Master 4.574167 3.5 3.619872 0.6033119
Dr. 42.000000 46.5 12.016655 4.9057789
Army 56.600000 56.0 9.316652 4.1665333
Rev. 43.166667 46.5 13.136463 5.3629387
SexOptions<-c("male","female")
for (i in 1:length(SexOptions))
{
    df<-subset(Rawtrain,(Sex==SexOptions[i]))
    Mean<-mean(df$Age,na.rm=TRUE) 
    Median<-median(df$Age,na.rm=TRUE)
    
   
    
    Sdv<-sd(df$Age,na.rm=TRUE)
    if (i==1)
      DFSexStat<-data.frame("Sex" = SexOptions[i],"Mean" = Mean , "Median" = Median , "Sdv" = Sdv )
    else
    {
      DF<-data.frame("Sex" = SexOptions[i],"Mean" = Mean , "Median" = Median , "Sdv" = Sdv )
      DFSexStat<-rbind(DFSexStat,DF)
    }
}


kable((DFSexStat)) %>%
  kable_styling("striped", full_width = F) 
Sex Mean Median Sdv
male 30.72664 29 14.67820
female 27.91571 27 14.11015
DFAgeStat<-setDT(DFAgeStat)
setkey(DFAgeStat,Title,Mean,Median,Sdv)

The logic will be: If we have Title and missing age we will use the mean title to complete If we have the age data but no Title data we can add the Title (based on age and sex) if we only have sex, we will use it to evaluate the age and title

Preparing three approaches for later user 1. Use the mean 2. Use the median 3. Use Random normal distribution variable

Since this is like building a small model, we will use them Mean Median and the Random variables that is computes using the train data and complete the data using the model for both the test and the train sets

Complete Missing Age

#Handle cases where Age is missing, but we have the Title data 
# The 

#Loop creating the three options described above 
TitlesOptions<-c("Mr.","Miss","Mrs","Master","Dr.","Army","Rev.")
Rawtrain<-mutate(Rawtrain,AgeMean=Age)
Rawtrain<-mutate(Rawtrain,AgeMedian=Age)
Rawtrain<-mutate(Rawtrain,AgeNormal=Age)
set.seed(444)
for (i in 1:length(TitlesOptions))
{
  #Mean
  Rawtrain$AgeMean[is.na(Rawtrain$Age) & Rawtrain$Title == TitlesOptions[i]]<- DFAgeStat[.(TitlesOptions[i])]$Mean
  #Median
  Rawtrain$AgeMedian[is.na(Rawtrain$Age) & Rawtrain$Title == TitlesOptions[i]]<- DFAgeStat[.(TitlesOptions[i])]$Median
  #Normal Distrbution variable based on the Mean and SD 
  
  #Calculate the number of missing items 
  n<-dim(subset(Rawtrain,is.na(Rawtrain$Age) & Rawtrain$Title == TitlesOptions[i]))[1]
  #Create a Normal Random Variable with Mean  and SD^2/n  and sample according to n
  Rawtrain$AgeNormal[is.na(Rawtrain$Age) & Rawtrain$Title == TitlesOptions[i]]<-
    sample(rnorm(10000,DFAgeStat[.(TitlesOptions[i])]$Mean,DFAgeStat[.(TitlesOptions[i])]$Esdv),n)   
  
}

#Repeat the same process over the test set 
Rawtest<-mutate(Rawtest,AgeMean=Age)
Rawtest<-mutate(Rawtest,AgeMedian=Age)
Rawtest<-mutate(Rawtest,AgeNormal=Age)
set.seed(444)
for (i in 1:length(TitlesOptions))
{
  #Mean
  Rawtest$AgeMean[is.na(Rawtest$Age) & Rawtest$Title == TitlesOptions[i]]<- DFAgeStat[.(TitlesOptions[i])]$Mean
  #Median
  Rawtest$AgeMedian[is.na(Rawtest$Age) & Rawtest$Title == TitlesOptions[i]]<- DFAgeStat[.(TitlesOptions[i])]$Median
  #Normal Distrbution variable based on the Mean and SD 
  
  #Calculate the number of missing items 
  n<-dim(subset(Rawtest,is.na(Rawtest$Age) & Rawtest$Title == TitlesOptions[i]))[1]
  #Create a Normal Random Variable with Mean  and SD^2/n  and sample according to n
  Rawtest$AgeNormal[is.na(Rawtest$Age) & Rawtest$Title == TitlesOptions[i]]<-
    sample(rnorm(10000,DFAgeStat[.(TitlesOptions[i])]$Mean,DFAgeStat[.(TitlesOptions[i])]$Esdv),n)   
  
} 

Finaly let’s compare the 4 new Age tables to verify the results (Train set)

Age Versus Titles - Train

#Plot the Original Age column (this time without the NA values)
p1<-ggplot(subset(Rawtrain,!is.na(Age)), aes(x=as.factor(Title),
                          y=Age,color = as.factor(Title))) +   geom_boxplot()
p1<-p1+xlab("Title") + ylab("Age - Years")
p1<-p1+labs(title=" Boxplot Of Title -- Age ")
p1<-p1+theme(axis.title.x = element_text(size = rel(1.2)),
             axis.title.y = element_text(size = rel(1.2)),
        legend.text = element_text(size = rel(1.2)),
         axis.text.x.top =  element_text(size = rel(1.2)),
        legend.title = element_text(size = rel(1.2)))

p1<-p1+theme(axis.text.x = element_text(size=12 ))
p1<-p1+theme(axis.text.y = element_text(size=12 ))
p1<-p1+ labs(col="Title")                          


p2<-ggplot(Rawtrain, aes(x=as.factor(Title),y=AgeMean,color = as.factor(Title))) +   geom_boxplot()
p2<-p2+xlab("Title") + ylab("Age  - Years")
p2<-p2+labs(title=" Boxplot Of Title -- Age Missing Data Method: - Mean  ")
p2<-p2+theme(axis.title.x = element_text(size = rel(1.2)),
             axis.title.y = element_text(size = rel(1.2)),
        legend.text = element_text(size = rel(1.2)),
         axis.text.x.top =  element_text(size = rel(1.2)),
        legend.title = element_text(size = rel(1.2)))

p2<-p2+theme(axis.text.x = element_text(size=12 ))
p2<-p2+theme(axis.text.y = element_text(size=12 ))
p2<-p2+ labs(col="Title")  


p3<-ggplot(Rawtrain, aes(x=as.factor(Title),y=AgeMedian,color = as.factor(Title))) +   geom_boxplot()
p3<-p3+xlab("Title") + ylab("Age  - Years")
p3<-p3+labs(title=" Boxplot Of Title -- Age Missing Data Method: - Median  ")
p3<-p3+theme(axis.title.x = element_text(size = rel(1.2)),
             axis.title.y = element_text(size = rel(1.2)),
        legend.text = element_text(size = rel(1.2)),
         axis.text.x.top =  element_text(size = rel(1.2)),
        legend.title = element_text(size = rel(1.2)))

p3<-p3+theme(axis.text.x = element_text(size=12 ))
p3<-p3+theme(axis.text.y = element_text(size=12 ))
p3<-p3+ labs(col="Title")  

p4<-ggplot(Rawtrain, aes(x=as.factor(Title),y=AgeNormal,color = as.factor(Title))) +   geom_boxplot()
p4<-p4+xlab("Title") + ylab("Age  - Years")
p4<-p4+labs(title=" Boxplot Of Title -- Age Missing Data Method: - Normal Random Variable  ")
p4<-p4+theme(axis.title.x = element_text(size = rel(1.2)),
             axis.title.y = element_text(size = rel(1.2)),
        legend.text = element_text(size = rel(1.2)),
         axis.text.x.top =  element_text(size = rel(1.2)),
        legend.title = element_text(size = rel(1.2)))

p4<-p4+theme(axis.text.x = element_text(size=12 ))
p4<-p4+theme(axis.text.y = element_text(size=12 ))
p4<-p4+ labs(col="Title")  




grid.arrange(p1,p2,p3,p4,nrow =4)

Age Versus Titles - Test

Creating boxplot of Age versus titles 1.Original plot (without the NA) 2.With the method of replacing NA by mean 3. With the technique of replacing Na by median 4. With the approach of replacing NA by Random variable

#Plot the Original Age column (this time without the NA values)
p1<-ggplot(subset(Rawtest,!is.na(Age)), aes(x=as.factor(Title),
                          y=Age,color = as.factor(Title))) +  
  geom_boxplot()
p1<-p1+xlab("Title") + ylab("Age - Years")
p1<-p1+labs(title=" Boxplot Of Title -- Age ")
p1<-p1+theme(axis.title.x = element_text(size = rel(1.2)),
             axis.title.y = element_text(size = rel(1.2)),
        legend.text = element_text(size = rel(1.2)),
         axis.text.x.top =  element_text(size = rel(1.2)),
        legend.title = element_text(size = rel(1.2)))

p1<-p1+theme(axis.text.x = element_text(size=12 ))
p1<-p1+theme(axis.text.y = element_text(size=12 ))
p1<-p1+ labs(col="Title")                          


#Plot the Age column (this time with Age mean instead of  the NA values)
p2<-ggplot(Rawtest, aes(x=as.factor(Title),
                        y=AgeMean,color = as.factor(Title))) +   geom_boxplot()
p2<-p2+xlab("Title") + ylab("Age  - Years")
p2<-p2+labs(title=" Boxplot Of Title -- Age Missing Data Method: - Mean  ")
p2<-p2+theme(axis.title.x = element_text(size = rel(1.2)),
             axis.title.y = element_text(size = rel(1.2)),
        legend.text = element_text(size = rel(1.2)),
         axis.text.x.top =  element_text(size = rel(1.2)),
        legend.title = element_text(size = rel(1.2)))

p2<-p2+theme(axis.text.x = element_text(size=12 ))
p2<-p2+theme(axis.text.y = element_text(size=12 ))
p2<-p2+ labs(col="Title")  

#Plot the Age column (this time with Age median instead of  the NA values)
p3<-ggplot(Rawtest, aes(x=as.factor(Title),
                        y=AgeMedian,color = as.factor(Title))) +   geom_boxplot()
p3<-p3+xlab("Title") + ylab("Age  - Years")
p3<-p3+labs(title=" Boxplot Of Title -- Age Missing Data Method: - Median  ")
p3<-p3+theme(axis.title.x = element_text(size = rel(1.2)),
             axis.title.y = element_text(size = rel(1.2)),
        legend.text = element_text(size = rel(1.2)),
         axis.text.x.top =  element_text(size = rel(1.2)),
        legend.title = element_text(size = rel(1.2)))

p3<-p3+theme(axis.text.x = element_text(size=12 ))
p3<-p3+theme(axis.text.y = element_text(size=12 ))
p3<-p3+ labs(col="Title")  

#Plot the Age column (this time with a normal random variable instead of  the NA values)
p4<-ggplot(Rawtest, aes(x=as.factor(Title),
                        y=AgeNormal,color = as.factor(Title))) +   geom_boxplot()
p4<-p4+xlab("Title") + ylab("Age  - Years")
p4<-p4+labs(title=" Boxplot Of Title -- Age Missing Data Method: - Normal Random Variable  ")
p4<-p4+theme(axis.title.x = element_text(size = rel(1.2)),
             axis.title.y = element_text(size = rel(1.2)),
        legend.text = element_text(size = rel(1.2)),
         axis.text.x.top =  element_text(size = rel(1.2)),
        legend.title = element_text(size = rel(1.2)))

p4<-p4+theme(axis.text.x = element_text(size=12 ))
p4<-p4+theme(axis.text.y = element_text(size=12 ))
p4<-p4+ labs(col="Title")  


grid.arrange(p1,p2,p3,p4,nrow =4)

As we can see in all methods, there is no drastic change (There are more outlier, but this is expected as we had more samples)

Complete Missing Titles

We can complete the missing titles data based on Age and Sex We will choose either Mr., Mrs. Miss or Master

#Subset the missing titles 
res<-Rawtrain[Rawtrain$Title=="None",]

#Complate based on the following
#Male < 18 - Master
#Female <28 - Miss
#Female >=8 Mrs.
#Male >18 Mr.
Rawtrain[Rawtrain$Title=="None",]$Title<-
  ifelse(res$Age<18 & res$Sex=="male","Master",ifelse(res$Age<28 &   
                        res$Sex=="female","Miss",
                        ifelse(res$Sex=="female" & res$Age>=28,"Mrs","Mr.")))

Rawtest[Rawtest$Title=="None",]
##  [1] PassengerId Pclass      Name        Sex         Age        
##  [6] SibSp       Parch       Ticket      Fare        Cabin      
## [11] Embarked    Sexb        Title       AgeMean     AgeMedian  
## [16] AgeNormal  
## <0 rows> (or 0-length row.names)
#Convert to Factors 
Rawtrain$Title<-as.factor(Rawtrain$Title)
Rawtest$Title<-as.factor(Rawtest$Title)

No missing data for the Test set

SibSp & Parch

Sibsp - Number of siblings/spouses aboard the Titanic Parch - Number of parents/children aboard the Titanic

Plot Sibsp Versus Survived Death

#Plot Sibsp Versus Survived Death 
p<-ggplot(Rawtrain,aes(x=as.factor(SibSp),
                       group=as.factor(Survived),fill=as.factor(Survived)))+
  geom_bar()
p<-p+labs(title=" Bar Plot of Sibsp Vs. Survived  ")
p<-p+scale_fill_manual("legend",
                       labels = c("Dead", "Survived"), values = c("red","green"))
p<-p+xlab("Sibsp ") + ylab("Count")
p

Plot Parch Versus Survived Death

#Plot Parch Versus Survived Death 
p<-ggplot(Rawtrain,aes(x=as.factor(Parch),
                       group=as.factor(Survived),fill=as.factor(Survived)))+
  geom_bar()
p<-p+labs(title=" Bar Plot of Parch Vs. Survived ")
p<-p+scale_fill_manual("legend",labels = c("Dead", "Survived"),
                       values = c("red","green"))
p<-p+xlab("Parch ") + ylab("Count")
p

Let’s try to build some categorical parameters based on these two variables Ans see the impact on Survived/Death

First - Is the passengers alone

Rawtrain<-Rawtrain%>%mutate(Isalone=ifelse((SibSp+Parch) > 0,0,1))
Rawtest<-Rawtest%>%mutate(Isalone=ifelse((SibSp+Parch) > 0,0,1))

p<-ggplot(Rawtrain,aes(x=as.factor(Isalone),
                       group=as.factor(Survived),fill=as.factor(Survived)))+
  geom_bar()
p<-p+labs(title=" Bar Plot of Isalone  Vs. Survived ")
p<-p+xlab("Is Alone 1 - Yes  0 - No ") + ylab("Count")
p<-p+scale_fill_manual("legend",labels = c("Dead", "Survived"), values = c("red","green"))
p

Create a column with the total number of people in the group

Rawtrain<-Rawtrain%>%mutate(GroupNo=SibSp+Parch)
Rawtest<-Rawtest%>%mutate(GroupNo=SibSp+Parch)
p<-ggplot(Rawtrain,aes(x=as.factor(GroupNo),
                       group=as.factor(Survived),fill=as.factor(Survived)))+
  geom_bar()
p<-p+labs(title=" Bar Plot of Group size  Vs. Survived  ")
p<-p+scale_fill_manual("legend",labels = c("Dead", "Survived"), values = c("red","green"))
p

It looks like a group of 4 and above has less chance to survive.

#Rawtrain<-Rawtrain%>%mutate(IsLArgeGroup=ifelse(SibSp+Parch>=4,1,0))
#Rawtest<-Rawtest%>%mutate(IsLArgeGroup=ifelse(SibSp+Parch>=4,1,0))

Ticket

Ticket observations contain a set of characters (or nothing) and then numbers The first step clean the ticket observations and create a column with only the character set and a separate column for the number

#Remove all the digits 
#(Since the characters may have numbers also we remove when there is an occurrence of three or more )
RawData<-RawData%>%
mutate(TicketChar = gsub("[0-9]{3,}","",Ticket))

#Mark all empty Ticket Char by U 
RawData$TicketChar<-ifelse(RawData$TicketChar=="","U",RawData$TicketChar)

#Create anothher colomn with the numbers
RawData<-RawData%>%
mutate(TicketNumbers = strtoi(str_extract(Ticket,"[0-9]{3,}")))


#In the majority of the data the above
#condition (3 digits or above in a row )
#is Enough, There are few other cases
#where there are only characters, or there is one digit 
#Handle these cases 
#Extract the Digits and place them in the TicketNumbers
res<-as.numeric((RawData%>%
                   filter(grepl("\\. [0-9]{1}$",Ticket)))$Ticket%>%str_extract_all(" [0-9]{1}$"))
RawData[grep("\\. [0-9]{1}$",RawData$Ticket),]$TicketNumbers<-res
#make sure that all values are numerics 
RawData$TicketNumbers<-
  as.numeric(RawData$TicketNumbers)

#Set number 100 to the Ticketnumbers 
RawData[is.na(RawData$TicketNumbers),]$TicketNumbers<-
  as.integer(mean(RawData$TicketNumbers,na.rm = TRUE))


#Create A normlize TicketNumbers 
RawData<-RawData%>%mutate(TicketNumbers_Nor =normalize(TicketNumbers))

#Extract the Character part to TicketChar
res<-((RawData%>%filter(grepl("\\. [0-9]{1}$",Ticket)))$Ticket%>%str_extract_all("[^0-9 ]{1,}"))
RawData[grep("\\. [0-9]{1}$",RawData$Ticket),]$TicketChar<-res
RawData$TicketChar<-as.character(RawData$TicketChar)

#Handle the cases that there are no digits 
res<-as.character(subset(RawData,grepl("^[A-Za-z]+$",Ticket,perl=T))$Ticket)
RawData[grep("^[A-Za-z]+$",RawData$Ticket,perl=T),]$TicketChar<-res

#Remove dots  spaces and / from the Ticket chars
RawData$TicketChar<-str_replace_all(RawData$TicketChar,"[.]","")
RawData$TicketChar<-str_replace_all(RawData$TicketChar,"[ ]{1,}","")
RawData$TicketChar<-str_replace_all(RawData$TicketChar,"[///]{1,}","")

#Copy the Ticket new columns to the train and test sets
Rawtrain[,c("TicketChar","TicketNumbers","TicketNumbers_Nor")]<-
  RawData[1:nrow(Rawtrain),c("TicketChar","TicketNumbers","TicketNumbers_Nor")]
test.index<-(nrow(Rawtrain)+1):(nrow(Rawtrain)+nrow(Rawtest))
Rawtest[,c("TicketChar","TicketNumbers","TicketNumbers_Nor")]<-
  RawData[test.index,c("TicketChar","TicketNumbers","TicketNumbers_Nor")]

We need to verify that in the Test set there are no Ticket characters which are missing in the train set if this is the case we replace them with U

#Create a Match matrix 
TrainTicketchr<-sort(unique(Rawtrain$TicketChar))
TestTicketchr<-sort(unique(Rawtest$TicketChar))

MatchTicketChar<-match(TestTicketchr,TrainTicketchr)
#Checkk all NA in the test set 
Missing_Items<-TestTicketchr[which(is.na(MatchTicketChar))]

#Check how many items we are missing 
count<-0
for(i in 1:length(Missing_Items))
{
  count<-count + dim(Rawtest[Rawtest$TicketChar==Missing_Items[i],])[1]
  #Change the Missing Test items to U
  Rawtest[Rawtest$TicketChar==Missing_Items[i],]$TicketChar<-"U"
  
}
cat("Missing items",count)
## Missing items 6
#Create common factor levels 
df<-rbind(Rawtrain[,-2],Rawtest)
df$TicketChar<-as.factor(df$TicketChar)

Rawtrain$TicketChar<-df[1:nrow(Rawtrain),]$TicketChar
test.index<-(nrow(Rawtrain)+1):(nrow(Rawtrain)+nrow(Rawtest))
Rawtest$TicketChar<-df[test.index,]$TicketChar
#Plot a barplot of the ticket prefix versus Survived or Death (Omit the Unspecified - U)
p<-ggplot(subset(Rawtrain,TicketChar!="U"),
          aes(x=as.factor(TicketChar),
              group=as.factor(Survived),fill=as.factor(Survived)))+
  geom_bar()
p<-p+labs(title=" Bar Plot of TicketChar (Survived Vs. Death) ")
p<-p+theme(axis.text.x=element_text(angle=45, hjust=1))
p<-p+xlab("Ticket Characters")
p<-p+scale_fill_manual("legend",labels = c("Dead", "Survived"), values = c("red","green"))
p

Fare

The fare is a continues parameter There is one observation missing Replace the missing value by the mean

if (sum(is.na(RawData$Fare))>0)
  RawData[is.na(RawData$Fare),]$Fare<-mean(RawData$Fare,na.rm = TRUE)
Rawtrain[,c("Fare")]<-RawData[1:nrow(Rawtrain),c("Fare")]
test.index<-(nrow(Rawtrain)+1):(nrow(Rawtrain)+nrow(Rawtest))
Rawtest[,c("Fare")]<-RawData[test.index,c("Fare")]

plot the Fare Versus Survived or Death

#plot the Fare Versus Survived or Death 
p<-ggplot(Rawtrain,aes(x=Fare,
                       group=as.factor(Survived),fill=as.factor(Survived)))+
  geom_density(alpha = 0.5)
p<-p+labs(title=" Density Plot of Fare (Survived Vs. Death) ")
p<-p+scale_fill_manual("legend",
                       labels = c("Dead", "Survived"), values = c("red","green"))
p

Split Fare to subarea

p<-subset(Rawtrain,Fare<50)%>%
  ggplot(aes(x=Fare,group=as.factor(Survived),fill=as.factor(Survived)))+
  geom_density(alpha = 0.5)
p<-p+labs(title=" Density Plot of Fare (Survived Vs. Death) Fare < 50")
p<-p+scale_fill_manual("legend",
                       labels = c("Dead", "Survived"), values = c("red","green"))
p

p<-subset(Rawtrain,Fare>50 & Fare<100)%>%
  ggplot(aes(x=Fare,group=as.factor(Survived),fill=as.factor(Survived)))+
  geom_density(alpha = 0.5)
p<-p+labs(title=" Density Plot of Fare (Survived Vs. Death) Fare Between 50 and 100")
p<-p+scale_fill_manual("legend",labels = c("Dead", "Survived"), values = c("red","green"))
p

p<-subset(Rawtrain,Fare>100)%>%
  ggplot(aes(x=Fare,group=as.factor(Survived),fill=as.factor(Survived)))+
  geom_density(alpha = 0.5)
p<-p+labs(title=" Density Plot of Fare (Survived Vs. Death) Fare Above 100")
p<-p+scale_fill_manual("legend",labels = c("Dead", "Survived"), values = c("red","green"))
p

Embarked

Embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)

#plot Embarked Versus Death Survived 
p<-ggplot(Rawtrain, aes((y =..count..),x=Embarked)) + 
  geom_bar(aes(fill = as.factor(Survived)), position = "dodge")
p<-p+xlab("Embarked") + ylab("Count")
p<-p+scale_fill_manual(name="Survived/Death  ",values=c("red", "green"), 
                       labels=c("Death", "Survived"))
p<-p+labs(title=" Bar Plot Embarked -- Survived ")
p

#There are two missing values 
#Assign U to these values 
RawData$Embarked<-as.character(RawData$Embarked)

if (sum(RawData$Embarked=="")>0)
  RawData[RawData$Embarked=="",]$Embarked<-"U"

RawData$Embarked<-as.factor(RawData$Embarked)


Rawtrain$Embarked<-RawData[1:nrow(Rawtrain),]$Embarked
test.index<-(nrow(Rawtrain)+1):(nrow(Rawtrain)+nrow(Rawtest))
Rawtest$Embarked<-RawData[test.index,]$Embarked

Cabin

The cabin data has a high percentage of missing data First, let’s split the Cabin to the character and digits. Some of the cabin cells include more than one cabin, but they are adjacent We will use only the first one

RawData<-RawData%>%
  mutate(Cabin_Charcter = str_extract(Cabin,"[A-Za-z][0-9]"))

RawData$Cabin_Charcter<-str_extract(RawData$Cabin_Charcter,"^[A-Za-z]")
#Mark all missing cabin character as U 
RawData[is.na(RawData$Cabin_Charcter),]$Cabin_Charcter<-"U"

#Convert to Factor 
RawData$Cabin_Charcter<-as.factor(RawData$Cabin_Charcter)

#Copy the Cabin Character to the train and test sets 
Rawtrain[,"Cabin_Charcter"]<-RawData[1:nrow(Rawtrain),"Cabin_Charcter"]
test.index<-(nrow(Rawtrain)+1):(nrow(Rawtrain)+nrow(Rawtest))
Rawtest[,"Cabin_Charcter"]<-RawData[test.index,]$Cabin_Charcter 

Lets see the Survivle/Death Versus the Cabin location

p<-ggplot(subset(Rawtrain,is.na(Cabin_Charcter)==FALSE),
          aes(x=as.factor(Cabin_Charcter),
              group=as.factor(Survived),fill=as.factor(Survived)))+
  geom_bar( position = "dodge")
p<-p+labs(title=" Bar Plot of Cabin  Vs. Survived ")
p<-p+xlab("Cabin ") + ylab("Count")
p<-p+scale_fill_manual("legend",
                       labels = c("Dead", "Survived"), values = c("red","green"))
p

Normalize (Age, and Ticket Number)

Some of the Machine learning algorithms can leverage normalize values (for the continuous observations) We will create normlize colomns for 1. Fare 2. Ticket Number 3. The various Age options (Mean,Median and RV)

Rawtrain<-Rawtrain%>%
  mutate(Fare_Nor=normalize(Fare))

Rawtest<-Rawtest%>%
  mutate(Fare_Nor=normalize(Fare))

Rawtrain<-Rawtrain%>%
  mutate(TicketNumbers_Nor=normalize(TicketNumbers))

Rawtest<-Rawtest%>%
  mutate(TicketNumbers_Nor=normalize(TicketNumbers))

Rawtrain<-Rawtrain%>%
  mutate(AgeMean_Nor=normalize(AgeMean))

Rawtrain<-Rawtrain%>%
  mutate(AgeMedian_Nor=normalize(AgeMedian))

Rawtrain<-Rawtrain%>%
  mutate(AgeNormal_Nor=normalize(AgeNormal))

Rawtest<-Rawtest%>%
  mutate(AgeMean_Nor=normalize(AgeMean))

Rawtest<-Rawtest%>%
  mutate(AgeMedian_Nor=normalize(AgeMedian))

Rawtest<-Rawtest%>%
  mutate(AgeNormal_Nor=normalize(AgeNormal))


Rawtrain<-Rawtrain%>%
  mutate(GroupNo_Nor=normalize(GroupNo))

Rawtest<-Rawtest%>%
  mutate(GroupNo_Nor=normalize(GroupNo))

Convert Survived to a factor variable

Rawtrain$Survived<-as.factor(Rawtrain$Survived)

Dummy variables

Some models can not handle factor variables with more than 3 levels Creating dummy variable

#Create  dummies variables for items with more than two options 

RawData$Pclass<-as.factor(RawData$Pclass)
RawData$Embarked<-as.factor(RawData$Embarked)
RawData$Title<-as.factor(RawData$Title)


Raw_Dummy<-dummy_cols(RawData,select_columns = c("Pclass","Title","Embarked","Cabin_Charcter","TicketChar"),remove_first_dummy = TRUE)
Raw_Dummy<-Raw_Dummy[,18:ncol(Raw_Dummy)]



#Copy the Cabin Character to the train and test sets 
test.index<-(nrow(Rawtrain)+1):(nrow(Rawtrain)+nrow(Rawtest))
Train_Dummy<-Raw_Dummy[-test.index,]
Test_Dummy<-Raw_Dummy[test.index,]

Models

Logoistic Regression

First Models is basic Logistic Regression

These basic models scored ~~ 0.784-0.789

train1<-dplyr::select(Rawtrain,Survived,Pclass,Sex,AgeMean,Fare,Embarked,Title,GroupNo,
               TicketChar,TicketNumbers,Cabin_Charcter,TicketNumbers_Nor,AgeNormal_Nor,
               AgeMedian_Nor,Fare_Nor,TicketNumbers_Nor,GroupNo_Nor,AgeMean_Nor)

test1<-dplyr::select(Rawtest,Pclass,Sex,AgeMean,Fare,Embarked,Title,GroupNo,
               TicketChar,TicketNumbers,Cabin_Charcter,TicketNumbers_Nor,AgeNormal_Nor,
               AgeMedian_Nor,Fare_Nor,TicketNumbers_Nor,GroupNo_Nor,AgeMean_Nor)
#Split Data to train and validiation 
set.seed(1254)
train.index<-sample(1:nrow(train1),0.7*nrow(train1))
xtrain<-train1[train.index,]
valid<-train1[-train.index,]

train_control <- trainControl(method = "cv", number = 10)
set.seed(12345)
#train the model on training set
model.lg1 <- train(Survived ~ Sex+Fare+GroupNo+AgeMean_Nor+Cabin_Charcter+Pclass+Embarked+Title,
               data = xtrain,
                method = "glm",
               trControl =  train_control,
               family = binomial(link = "logit"))
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# print cv scores
summary(model.lg1)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7890  -0.4920  -0.3370   0.4438   2.5284  
## 
## Coefficients: (1 not defined because of singularities)
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      4.999e+00  1.083e+00   4.616 3.90e-06 ***
## Sexmale         -3.518e+00  1.295e+00  -2.718  0.00658 ** 
## Fare             3.145e-03  2.943e-03   1.069  0.28522    
## GroupNo         -4.881e-01  1.074e-01  -4.546 5.47e-06 ***
## AgeMean_Nor     -1.752e+00  9.670e-01  -1.812  0.07001 .  
## Cabin_CharcterB -4.376e-01  9.352e-01  -0.468  0.63984    
## Cabin_CharcterC -4.732e-01  8.912e-01  -0.531  0.59543    
## Cabin_CharcterD  3.323e-01  1.004e+00   0.331  0.74068    
## Cabin_CharcterE  9.211e-01  9.515e-01   0.968  0.33301    
## Cabin_CharcterF -2.325e-01  1.706e+00  -0.136  0.89160    
## Cabin_CharcterG -1.709e+00  1.660e+00  -1.029  0.30334    
## Cabin_CharcterU -1.231e+00  9.840e-01  -1.251  0.21080    
## Pclass2         -1.783e-01  6.670e-01  -0.267  0.78921    
## Pclass3         -1.312e+00  6.593e-01  -1.990  0.04661 *  
## EmbarkedQ       -5.833e-01  4.992e-01  -1.168  0.24262    
## EmbarkedS       -6.603e-01  3.192e-01  -2.068  0.03860 *  
## EmbarkedU        1.311e+01  2.400e+03   0.005  0.99564    
## TitleDr.        -1.511e+01  1.686e+03  -0.009  0.99285    
## TitleMaster      3.798e+00  1.472e+00   2.581  0.00986 ** 
## TitleMiss       -8.102e-01  4.303e-01  -1.883  0.05972 .  
## TitleMr.        -4.271e-01  1.263e+00  -0.338  0.73533    
## TitleMrs                NA         NA      NA       NA    
## TitleRev.       -1.497e+01  9.718e+02  -0.015  0.98771    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 828.65  on 622  degrees of freedom
## Residual deviance: 461.92  on 601  degrees of freedom
## AIC: 505.92
## 
## Number of Fisher Scoring iterations: 15
model.lg1
## Generalized Linear Model 
## 
## 623 samples
##   8 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 562, 561, 560, 561, 560, 562, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8254082  0.6321406
ypredict<-predict(model.lg1,newdata=valid)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
confusionMatrix(ypredict,valid$Survived)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 150  32
##          1  14  72
##                                           
##                Accuracy : 0.8284          
##                  95% CI : (0.7778, 0.8715)
##     No Information Rate : 0.6119          
##     P-Value [Acc > NIR] : 1.201e-14       
##                                           
##                   Kappa : 0.6268          
##  Mcnemar's Test P-Value : 0.01219         
##                                           
##             Sensitivity : 0.9146          
##             Specificity : 0.6923          
##          Pos Pred Value : 0.8242          
##          Neg Pred Value : 0.8372          
##              Prevalence : 0.6119          
##          Detection Rate : 0.5597          
##    Detection Prevalence : 0.6791          
##       Balanced Accuracy : 0.8035          
##                                           
##        'Positive' Class : 0               
## 
#Predict and write to file 
#res<-predict(model.lg1,newdata = test1)
#res<-data.frame(res)
#colnames(res)<-c("Survived")
#outcome<-cbind(Rawtest$PassengerId,res)
#colnames(outcome)<-c("PassengerId","Survived")


#write.csv(outcome,"out.csv",row.names=FALSE)

This submission got 0.78947

Logistic regression with interaction terms

#Split Data to train and validiation 
set.seed(1254)
train.index<-sample(1:nrow(train1),0.7*nrow(train1))
xtrain<-train1[train.index,]
valid<-train1[-train.index,]

train_control <- trainControl(method = "cv", number = 10)
set.seed(12345)
#train the model on training set
model.lg2 <- train(Survived ~ Sex+Fare+GroupNo+AgeMean_Nor+Cabin_Charcter+Pclass+Embarked+Title+Sex*AgeMean_Nor+Title*AgeMean_Nor,
               data = xtrain,
                method = "glm",
               trControl =  train_control,
               family = binomial(link = "logit"))
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# print cv scores
summary(model.lg2)
## 
## Call:
## NULL
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8955  -0.5026  -0.3372   0.4404   2.5480  
## 
## Coefficients: (2 not defined because of singularities)
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                4.559e+00  1.381e+00   3.302  0.00096 ***
## Sexmale                   -2.173e+00  7.955e+00  -0.273  0.78470    
## Fare                       3.315e-03  2.979e-03   1.113  0.26580    
## GroupNo                   -5.146e-01  1.145e-01  -4.493 7.01e-06 ***
## AgeMean_Nor               -7.992e-01  2.401e+00  -0.333  0.73922    
## Cabin_CharcterB           -3.847e-01  9.358e-01  -0.411  0.68099    
## Cabin_CharcterC           -4.094e-01  8.961e-01  -0.457  0.64772    
## Cabin_CharcterD            3.777e-01  1.009e+00   0.374  0.70807    
## Cabin_CharcterE            9.281e-01  9.485e-01   0.979  0.32782    
## Cabin_CharcterF           -2.885e-01  1.744e+00  -0.165  0.86859    
## Cabin_CharcterG           -1.903e+00  1.701e+00  -1.119  0.26328    
## Cabin_CharcterU           -1.198e+00  9.868e-01  -1.214  0.22458    
## Pclass2                   -1.410e-01  6.697e-01  -0.211  0.83325    
## Pclass3                   -1.302e+00  6.613e-01  -1.969  0.04892 *  
## EmbarkedQ                 -5.592e-01  5.010e-01  -1.116  0.26431    
## EmbarkedS                 -6.362e-01  3.212e-01  -1.981  0.04762 *  
## EmbarkedU                  1.274e+01  2.400e+03   0.005  0.99576    
## TitleDr.                  -1.825e+01  1.657e+04  -0.001  0.99912    
## TitleMaster                3.127e+00  7.957e+00   0.393  0.69433    
## TitleMiss                 -8.080e-02  1.190e+00  -0.068  0.94586    
## TitleMr.                  -1.515e+00  7.891e+00  -0.192  0.84773    
## TitleMrs                          NA         NA      NA       NA    
## TitleRev.                 -1.676e+01  3.632e+03  -0.005  0.99632    
## `Sexmale:AgeMean_Nor`     -2.417e+00  1.192e+01  -0.203  0.83935    
## `AgeMean_Nor:TitleDr.`     5.045e+00  2.701e+04   0.000  0.99985    
## `AgeMean_Nor:TitleMaster` -2.133e+00  1.685e+01  -0.127  0.89926    
## `AgeMean_Nor:TitleMiss`   -2.257e+00  3.065e+00  -0.736  0.46151    
## `AgeMean_Nor:TitleMr.`     1.755e+00  1.175e+01   0.149  0.88126    
## `AgeMean_Nor:TitleMrs`            NA         NA      NA       NA    
## `AgeMean_Nor:TitleRev.`    2.962e+00  6.511e+03   0.000  0.99964    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 828.65  on 622  degrees of freedom
## Residual deviance: 461.13  on 595  degrees of freedom
## AIC: 517.13
## 
## Number of Fisher Scoring iterations: 15
model.lg2
## Generalized Linear Model 
## 
## 623 samples
##   8 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 562, 561, 560, 561, 560, 562, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8190325  0.6194348
ypredict<-predict(model.lg2,newdata=valid)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
confusionMatrix(ypredict,valid$Survived)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 151  32
##          1  13  72
##                                           
##                Accuracy : 0.8321          
##                  95% CI : (0.7819, 0.8748)
##     No Information Rate : 0.6119          
##     P-Value [Acc > NIR] : 3.86e-15        
##                                           
##                   Kappa : 0.6342          
##  Mcnemar's Test P-Value : 0.00729         
##                                           
##             Sensitivity : 0.9207          
##             Specificity : 0.6923          
##          Pos Pred Value : 0.8251          
##          Neg Pred Value : 0.8471          
##              Prevalence : 0.6119          
##          Detection Rate : 0.5634          
##    Detection Prevalence : 0.6828          
##       Balanced Accuracy : 0.8065          
##                                           
##        'Positive' Class : 0               
## 
#Predict and write to file 
#res<-predict(model.lg1,newdata = test1)
#res<-data.frame(res)
#colnames(res)<-c("Survived")
#outcome<-cbind(Rawtest$PassengerId,res)
#colnames(outcome)<-c("PassengerId","Survived")


#write.csv(outcome,"out.csv",row.names=FALSE)

Lasso on Logitic regression

set.seed(12312)
x.train <- model.matrix(Survived ~Sex+Fare_Nor+GroupNo_Nor+AgeMean_Nor+Cabin_Charcter+Pclass+Embarked+Title, xtrain)

 y<- ifelse(xtrain$Survived == 1, 1, 0)
cv.lasso <- cv.glmnet(x.train, y, alpha = 1, family = "binomial",nfolds = 3)
model.Lasso <- glmnet(x.train, y, alpha = 1, family = "binomial",
                lambda = cv.lasso$lambda.min)




x.test <- model.matrix(Survived ~Sex+Fare_Nor+GroupNo_Nor+AgeMean_Nor+Cabin_Charcter+Pclass+Embarked+Title, valid)
probabilities <- model.Lasso %>% predict(newx = x.test)
res <- ifelse(probabilities > 0.5, 1, 0)
res<-as.factor(res)
confusionMatrix(res,valid$Survived)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 158  45
##          1   6  59
##                                           
##                Accuracy : 0.8097          
##                  95% CI : (0.7575, 0.8549)
##     No Information Rate : 0.6119          
##     P-Value [Acc > NIR] : 2.403e-12       
##                                           
##                   Kappa : 0.5698          
##  Mcnemar's Test P-Value : 1.032e-07       
##                                           
##             Sensitivity : 0.9634          
##             Specificity : 0.5673          
##          Pos Pred Value : 0.7783          
##          Neg Pred Value : 0.9077          
##              Prevalence : 0.6119          
##          Detection Rate : 0.5896          
##    Detection Prevalence : 0.7575          
##       Balanced Accuracy : 0.7654          
##                                           
##        'Positive' Class : 0               
## 

This submission got 0.77990

Ridge on Logitic regression

set.seed(12312)
x.train <- model.matrix(Survived ~Sex+Fare_Nor+GroupNo_Nor+AgeMean_Nor+Cabin_Charcter+Pclass+Embarked+Title, xtrain)
 y<- ifelse(xtrain$Survived == 1, 1, 0)

cv.Ridge <- cv.glmnet(x.train, y, alpha = 0, family = "binomial")
model.ridge <- glmnet(x.train, y, alpha = 0, family = "binomial",
                lambda = cv.Ridge$lambda.min)

x.test <- model.matrix(Survived ~Sex+Fare_Nor+GroupNo_Nor+AgeMean_Nor+Cabin_Charcter+Pclass+Embarked+Title, valid)
probabilities <- cv.Ridge %>% predict(newx = x.test)
res <- ifelse(probabilities > 0.5, 1, 0)
res<-as.factor(res)
confusionMatrix(res,valid$Survived)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 161  53
##          1   3  51
##                                           
##                Accuracy : 0.791           
##                  95% CI : (0.7374, 0.8381)
##     No Information Rate : 0.6119          
##     P-Value [Acc > NIR] : 2.655e-10       
##                                           
##                   Kappa : 0.5176          
##  Mcnemar's Test P-Value : 5.835e-11       
##                                           
##             Sensitivity : 0.9817          
##             Specificity : 0.4904          
##          Pos Pred Value : 0.7523          
##          Neg Pred Value : 0.9444          
##              Prevalence : 0.6119          
##          Detection Rate : 0.6007          
##    Detection Prevalence : 0.7985          
##       Balanced Accuracy : 0.7360          
##                                           
##        'Positive' Class : 0               
## 

This submission got 0.77511

Random Forest

#Split Data to train and validiation 
set.seed(1254)
train.index<-sample(1:nrow(train1),0.7*nrow(train1))
xtrain<-train1[train.index,]
valid<-train1[-train.index,]


#Fit Random Forest Model
rf = randomForest(as.factor(Survived) ~Sex+Fare+GroupNo+AgeMean+Cabin_Charcter+Pclass+Embarked+Title+TicketChar+TicketNumbers, #
                   ntree = 400,nodesize =5,mtry=22,
                   data = xtrain)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within
## valid range
plot(rf)  

var.imp = data.frame(importance(rf,  
                                 type=2))
# make row names as columns
var.imp$Variables = row.names(var.imp)  
print(var.imp[order(var.imp$MeanDecreaseGini,decreasing = T),])
##                MeanDecreaseGini      Variables
## Title               105.7998374          Title
## TicketNumbers        41.6235774  TicketNumbers
## Fare                 35.2912966           Fare
## Pclass               21.9215027         Pclass
## AgeMean              21.5130449        AgeMean
## Cabin_Charcter       14.5944864 Cabin_Charcter
## GroupNo              11.3479746        GroupNo
## TicketChar            9.1992271     TicketChar
## Embarked              2.2918210       Embarked
## Sex                   0.5205104            Sex
#predict
res<-predict(rf , valid)


confusionMatrix(data = res,  
                reference = valid$Survived)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 148  34
##          1  16  70
##                                           
##                Accuracy : 0.8134          
##                  95% CI : (0.7615, 0.8582)
##     No Information Rate : 0.6119          
##     P-Value [Acc > NIR] : 8.745e-13       
##                                           
##                   Kappa : 0.5943          
##  Mcnemar's Test P-Value : 0.01621         
##                                           
##             Sensitivity : 0.9024          
##             Specificity : 0.6731          
##          Pos Pred Value : 0.8132          
##          Neg Pred Value : 0.8140          
##              Prevalence : 0.6119          
##          Detection Rate : 0.5522          
##    Detection Prevalence : 0.6791          
##       Balanced Accuracy : 0.7878          
##                                           
##        'Positive' Class : 0               
## 

Random forest with dummies variables

trainOnlyDummies<-dplyr::select(Rawtrain,Survived,Sex,AgeMean,Fare,GroupNo,
               TicketNumbers)
testOnlyDummies<-dplyr::select(Rawtest,Sex,AgeMean,Fare,GroupNo,
               TicketNumbers)

trainOnlyDummies<-cbind(trainOnlyDummies,Train_Dummy)
testOnlyDummies<-cbind(testOnlyDummies,Test_Dummy)


set.seed(1254)
train.index<-sample(1:nrow(trainOnlyDummies),.7*nrow(trainOnlyDummies))
xtrain<-trainOnlyDummies[train.index,]
valid<-trainOnlyDummies[-train.index,]

set.seed(1254)
#Fit Random Forest Model
rf = randomForest(as.factor(Survived) ~., #
                   ntree = 1500,nodesize =5,mtry=22,
                   data = xtrain)
plot(rf)  

var.imp = data.frame(importance(rf,  
                                 type=2))
# make row names as columns
var.imp$Variables = row.names(var.imp)  
print(var.imp[order(var.imp$MeanDecreaseGini,decreasing = T),])
##                      MeanDecreaseGini            Variables
## Sex                      5.911214e+01                  Sex
## TicketNumbers            3.927393e+01        TicketNumbers
## Fare                     3.429660e+01                 Fare
## AgeMean                  2.858818e+01              AgeMean
## GroupNo                  1.558652e+01              GroupNo
## Title_Mrs                1.451353e+01            Title_Mrs
## Pclass_1                 1.093929e+01             Pclass_1
## Title_Miss               9.993897e+00           Title_Miss
## Title_Master             6.801235e+00         Title_Master
## Pclass_2                 4.736490e+00             Pclass_2
## Cabin_Charcter_E         4.200610e+00     Cabin_Charcter_E
## Embarked_C               3.047645e+00           Embarked_C
## TicketChar_U             2.504929e+00         TicketChar_U
## Cabin_Charcter_D         2.125412e+00     Cabin_Charcter_D
## Embarked_Q               1.425663e+00           Embarked_Q
## Cabin_Charcter_B         1.187897e+00     Cabin_Charcter_B
## Cabin_Charcter_C         1.163246e+00     Cabin_Charcter_C
## TicketChar_PC            1.010283e+00        TicketChar_PC
## TicketChar_STONO2        9.326869e-01    TicketChar_STONO2
## TicketChar_SWPP          8.352090e-01      TicketChar_SWPP
## Cabin_Charcter_A         7.681168e-01     Cabin_Charcter_A
## TicketChar_C             6.823772e-01         TicketChar_C
## TicketChar_CA            5.568782e-01        TicketChar_CA
## TicketChar_WC            4.286416e-01        TicketChar_WC
## TicketChar_SOPP          3.919196e-01      TicketChar_SOPP
## TicketChar_SCPARIS       3.519968e-01   TicketChar_SCPARIS
## TicketChar_SOTONOQ       3.346646e-01   TicketChar_SOTONOQ
## Title_Army               3.336782e-01           Title_Army
## Title_None               2.750759e-01           Title_None
## Cabin_Charcter_F         1.881099e-01     Cabin_Charcter_F
## TicketChar_WEP           1.489556e-01       TicketChar_WEP
## Cabin_Charcter_G         1.483677e-01     Cabin_Charcter_G
## TicketChar_PP            8.317271e-02        TicketChar_PP
## TicketChar_SOC           7.932757e-02       TicketChar_SOC
## TicketChar_FC            7.823381e-02        TicketChar_FC
## TicketChar_FCC           6.072494e-02       TicketChar_FCC
## Title_Rev.               4.615355e-02           Title_Rev.
## TicketChar_PPP           3.889417e-02       TicketChar_PPP
## TicketChar_SCAH          3.376204e-02      TicketChar_SCAH
## Title_Dr.                2.940985e-02            Title_Dr.
## TicketChar_SCAHBasle     1.879568e-02 TicketChar_SCAHBasle
## TicketChar_SC            1.780003e-02        TicketChar_SC
## TicketChar_SOP           1.381581e-02       TicketChar_SOP
## TicketChar_SCParis       1.033442e-02   TicketChar_SCParis
## TicketChar_A4            9.745014e-03        TicketChar_A4
## TicketChar_SP            6.312914e-03        TicketChar_SP
## TicketChar_LINE          2.791275e-03      TicketChar_LINE
## TicketChar_Fa            2.755659e-03        TicketChar_Fa
## Embarked_U               1.642452e-03           Embarked_U
## TicketChar_AS            1.190815e-03        TicketChar_AS
## TicketChar_SCA4          1.098984e-03      TicketChar_SCA4
## TicketChar_CASOTON       1.012628e-03   TicketChar_CASOTON
## TicketChar_SOTONO2       7.496708e-04   TicketChar_SOTONO2
## TicketChar_SCOW          0.000000e+00      TicketChar_SCOW
## TicketChar_SCA3          0.000000e+00      TicketChar_SCA3
## TicketChar_STONOQ        0.000000e+00    TicketChar_STONOQ
## TicketChar_AQ4           0.000000e+00       TicketChar_AQ4
## TicketChar_A2            0.000000e+00        TicketChar_A2
## TicketChar_LP            0.000000e+00        TicketChar_LP
## TicketChar_AQ3           0.000000e+00       TicketChar_AQ3
#predict
res<-predict(rf , valid)


confusionMatrix(data = res,  
                reference = valid$Survived)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 152  38
##          1  12  66
##                                           
##                Accuracy : 0.8134          
##                  95% CI : (0.7615, 0.8582)
##     No Information Rate : 0.6119          
##     P-Value [Acc > NIR] : 8.745e-13       
##                                           
##                   Kappa : 0.5884          
##  Mcnemar's Test P-Value : 0.000407        
##                                           
##             Sensitivity : 0.9268          
##             Specificity : 0.6346          
##          Pos Pred Value : 0.8000          
##          Neg Pred Value : 0.8462          
##              Prevalence : 0.6119          
##          Detection Rate : 0.5672          
##    Detection Prevalence : 0.7090          
##       Balanced Accuracy : 0.7807          
##                                           
##        'Positive' Class : 0               
## 

Tunning paramters

control <- trainControl(method="repeatedcv", number=10, repeats=3)
seed <- 7
metric <- "Accuracy"
set.seed(seed)
x<-xtrain
mtry <- sqrt(ncol(x))
tunegrid <- expand.grid(.mtry=mtry)
rf_default <- train(as.factor(Survived)~., data=x, method="rf", metric=metric, tuneGrid=tunegrid, trControl=control)
print(rf_default)
## Random Forest 
## 
## 623 samples
##  60 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 560, 561, 560, 561, 562, 561, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.8214231  0.6179442
## 
## Tuning parameter 'mtry' was held constant at a value of 7.81025

Tune Using Caret

The caret package in R provides an excellent facility to tune machine learning algorithm parameters.

As such, only mtry parameter is available in caret for tuning. The reason is its effect on the final accuracy and that it must be found empirically for a dataset.

The ntree parameter is different in that it can be as large as you like, and continues to increases the accuracy up to some point. It is less difficult or critical to tune and could be limited more by compute time available more than anything.

Random and Grid Search for mtry

trainOnlyDummies<-dplyr::select(Rawtrain,Survived,Sex,AgeMean,Fare,GroupNo, TicketNumbers) testOnlyDummies<-dplyr::select(Rawtest,Sex,AgeMean,Fare,GroupNo, TicketNumbers)

trainOnlyDummies<-cbind(trainOnlyDummies,Train_Dummy) testOnlyDummies<-cbind(testOnlyDummies,Test_Dummy)

set.seed(1254) train.index<-sample(1:nrow(trainOnlyDummies),0.7*nrow(trainOnlyDummies)) xtrain<-trainOnlyDummies[train.index,] valid<-trainOnlyDummies[-train.index,]

Grid Searce

control <- trainControl(method=“repeatedcv”, number=10, repeats=3, search=“grid”) set.seed(seed) tunegrid <- expand.grid(.mtry=c(1:25)) rf_gridsearch <- train(as.factor(Survived)~., data=x, method=“rf”, metric=metric, tuneGrid=tunegrid, trControl=control) print(rf_gridsearch) plot(rf_gridsearch)

Extend Caret search for best number of trees

x <- xtrain[,c(1:ncol(xtrain))] y <- xtrain[,1]

customRF <- list(type = “Classification”, library = “randomForest”, loop = NULL) customRF\(parameters <- data.frame(parameter = c("mtry", "ntree"), class = rep("numeric", 2), label = c("mtry", "ntree")) customRF\)grid <- function(x, y, len = NULL, search = “grid”) {} customRF\(fit <- function(x, y, wts, param, lev, last, weights, classProbs, ...) { randomForest(x, y, mtry = param\)mtry, ntree=param\(ntree, ...) } customRF\)predict <- function(modelFit, newdata, preProc = NULL, submodels = NULL) predict(modelFit, newdata) customRF\(prob <- function(modelFit, newdata, preProc = NULL, submodels = NULL) predict(modelFit, newdata, type = "prob") customRF\)sort <- function(x) x[order(x[,1]),] customRF\(levels <- function(x) x\)classes

train model

control <- trainControl(method=“repeatedcv”, number=10, repeats=3) tunegrid <- expand.grid(.mtry=c(1:25), .ntree=c(1000, 1500, 2000, 2500)) set.seed(seed) custom <- train(as.factor(Survived)~., data=xtrain, method=customRF, metric=metric, tuneGrid=tunegrid, trControl=control) summary(custom) plot(custom)