R markdown for providing my approach to the Titanic Competition in Kaggle This kernel includes data loading cleaning and some exploratory work it also contains basic modeling
A simple normalize function, which normalizes continuous data to values between 0 and 1 some machine learning models work better with normalize data
##########################
#Function normalize #
#Input X vector #
#Return Normlize vector #
# #
# #
#########################
normalize <- function(x) {
return ((x - min(x,na.rm =TRUE)) / (max(x,na.rm =TRUE) - min(x,na.rm =TRUE)))
}
setwd("E:/Elements/kaggle/Kaggle_Titanic")
#setwd("C:/Users/cbs/Documents/R/Kaggle.Titanic")
train<-read.csv("train.csv")
test<-read.csv("test.csv")
#Create Raw train, test and combined data
#Combined data will be used to do common preprocessing (test and train)
#Though we will be careful not to allow a leak of a data
#from train to test data
Rawtrain<-train
Rawtest<-test
# To combine the train and test -
#create dummy Survived column (and then remove it)
Rawtest['Survived']<-0
RawData<-rbind(train,Rawtest)
Rawtest<-subset(Rawtest,select = -Survived)
First, Let’s review the data.
#Create a table of the first ten observations
kable(head(Rawtrain)) %>%
kable_styling(c("striped", "bordered")) %>%
add_indent(c(1, 3, 5))
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked |
|---|---|---|---|---|---|---|---|---|---|---|---|
| 1 | 0 | 3 | Braund, Mr. Owen Harris | male | 22 | 1 | 0 | A/5 21171 | 7.2500 | S | |
| 2 | 1 | 1 | Cumings, Mrs. John Bradley (Florence Briggs Thayer) | female | 38 | 1 | 0 | PC 17599 | 71.2833 | C85 | C |
| 3 | 1 | 3 | Heikkinen, Miss. Laina | female | 26 | 0 | 0 | STON/O2. 3101282 | 7.9250 | S | |
| 4 | 1 | 1 | Futrelle, Mrs. Jacques Heath (Lily May Peel) | female | 35 | 1 | 0 | 113803 | 53.1000 | C123 | S |
| 5 | 0 | 3 | Allen, Mr. William Henry | male | 35 | 0 | 0 | 373450 | 8.0500 | S | |
| 6 | 0 | 3 | Moran, Mr. James | male | NA | 0 | 0 | 330877 | 8.4583 | Q |
The label data, what we need to predict is the Survival column 0 - No 1- Yes This set is a classification set.
One of the first steps is to take a look if the labels are not skew and we have a balance classification.
# Using the ggplot Package
# Plotting a Bar plot of the Survivles Vs. Death
p<-ggplot(data=Rawtrain,aes(x=as.factor(Survived)))+
geom_bar(aes(y = (..count..)) , fill = c('red','green'))
p<-p+xlab("Survived: 0 - No , 1 - Yes") + ylab("Count")
p<-p+labs(title="Survives Versus No Survivors counts ")
p<-p+geom_text(aes(y = (..count..),label = ifelse((..count..)==0,"",
scales::percent((..count..)/sum(..count..)))), stat="count",colour="black")
p
There is some skew as the number of Survival is smaller (ratio of 60%:40%) It is not a large skew if we use cross-validation techniques it probably not a big issue
Let’s review how much data we are missing. Note: Some of the string variables have empty data and not null so first, we replace Empty data by NA
#Some of the missing data is NA, and some (mainly strings) is empty data
#convert empty strings to NA
Rawtrain<-Rawtrain %>%
mutate(Cabin = if_else(as.character(Cabin)=="", NA_character_ ,as.character(Cabin)))
Rawtrain<-Rawtrain %>%
mutate(Ticket = if_else(as.character(Ticket)=="", NA_character_ ,as.character(Ticket)))
Rawtrain<-Rawtrain %>%
mutate(Embarked = if_else(as.character(Embarked)=="", NA_character_ ,as.character(Embarked)))
Rawtest<-Rawtest %>%
mutate(Cabin = if_else(as.character(Cabin)=="", NA_character_ ,as.character(Cabin)))
Rawtest<-Rawtest %>%
mutate(Ticket = if_else(as.character(Ticket)=="", NA_character_ ,as.character(Ticket)))
Rawtest<-Rawtest %>%
mutate(Embarked = if_else(as.character(Embarked)=="", NA_character_ ,as.character(Embarked)))
#Using the Inspect package show all missing items
inspect.na(Rawtrain, barplot=TRUE)
## column_index column_name number_of_NAs ratio_of_NA
## 1 6 Age 177 0.198653199
## 2 11 Cabin 687 0.771043771
## 3 12 Embarked 2 0.002244669
inspect.na(Rawtest, barplot=TRUE)
## column_index column_name number_of_NAs ratio_of_NA
## 1 5 Age 86 0.205741627
## 2 9 Fare 1 0.002392344
## 3 10 Cabin 327 0.782296651
Age and Cabin has a relatively high amount of missing data.
Pclass will be used as a Factor or Categorical variable
#plot Pcalss Versus Death Survived
theme_update(plot.title = element_text(hjust = 0.5))
p<-ggplot(Rawtrain, aes((y =..count..),x=Pclass)) +
geom_bar(aes(fill = as.factor(Survived)), position = "dodge")
p<-p+xlab("Pclass") + ylab("Count")
p<-p+scale_fill_manual(name="Survived/Death ",values=c("red", "green"),
labels=c("Death", "Survived"))
p<-p+labs(title=" Bar Plot Pclass -- Survived ")
p
There are three class types - 1, 2 and 3. It Seems that in Class 3 you have less chance to survive While in Class 1 you had a higher chance to survive In Class 2 the odd are Equal Convert the Pclass to Factor variable
#Convert to a factor
Rawtest$Pclass<-as.factor(Rawtest$Pclass)
Rawtrain$Pclass<-as.factor(Rawtrain$Pclass)
Plot Sex Versus Survived Death
#Plot Sex Versus Survived Death
p<-ggplot(data=Rawtrain,aes(x=as.factor(Survived),
fill=Sex))+geom_bar()
p<-p+scale_fill_manual("legend", values =
c("male" = "blue", "female" = "red"))
p<-p+xlab("Survived: 0 - No, 1 - Yes") + ylab("Count")
p<-p+labs(title="Survives Vs. Death Based on Sex ")
p<-p+geom_text(aes(label=..count..),
stat="count",position=position_stack(0.5))
p
Table of Sex Counts
kable(table(Rawtrain$Sex),col.names = c("Sex","Frequency")) %>%
kable_styling("striped", full_width = F)
| Sex | Frequency |
|---|---|
| female | 314 |
| male | 577 |
Checking the Correlation To check Correlation, we create a Categorical variable 1 - Female 0 - Sex
#Using GGpairs draw correlation graphs
Rawtrain<-mutate(Rawtrain,Sexb=ifelse(Sex=="female",1,0))
Rawtest<-mutate(Rawtest,Sexb=ifelse(Sex=="female",1,0))
ggpairs(subset(Rawtrain,select=c("Survived", "Sexb")))
Conclusion : 1. No Missing data or NA 2. There is some correlation between Sex and Survival 2. Sex looks like a good predictor/Feature to use
Plot Age Versus Survived Death
#Plot Age Versus Survived Death
p<-ggplot(Rawtrain,aes(x=Age,group=as.factor(Survived),
fill=as.factor(Survived)))+
geom_density(alpha = 0.5)
p<-p+scale_fill_manual("legend",labels = c("Dead", "Survived"),
values = c("red","green"))
p
## Warning: Removed 177 rows containing non-finite values (stat_density).
Later we will use the Data from the Sex and names to complete the missing data. Look for outliers and another way to look at the relation between Age and survivals.
#Boxplot Age - Versus Survived or Death
p<-ggplot(Rawtrain, aes(x=as.factor(Survived),y=Age,
color = as.factor(Survived))) + geom_boxplot()
p<-p+scale_color_manual(values=c("red", "green"))
p<-p+xlab("Survived: 0 - No , 1 - Yes") + ylab("Age - Years")
p<-p+labs(title=" Boxplot Of Survivals/Death -- Age")
p
## Warning: Removed 177 rows containing non-finite values (stat_boxplot).
We can see that Age as a continuous feature/Predictor there are points it is a good predictor and some that it doesn’t
In this part, we are doing some simple text analytics Let’s try to understand what type of word the naming feature can provide The step we will take care 1.Build a corpus 2. Tokenize (converting to tokens, words in this case ) Removing numbers, symbols separators, punctuation and some another standard cleaning (though it is probably less needed here ) 3.Create Uni Grahm 4.Sort according to count (Frequency )
#Create Corpus
corp <- corpus(as.character(RawData$Name))
#Tokenize
Text.Sentences <- tokens(
x = tolower(corp),
remove_punct = TRUE,
remove_twitter = TRUE,
remove_numbers = TRUE,
remove_hyphens = TRUE,
remove_symbols = TRUE,
remove_separators = TRUE,
remove_url = TRUE)
##############################################
# Create Uni Gram #
# #
##############################################
uni_DFM <- dfm(Text.Sentences)
#Trim to Words with Priority higher than 2
#Calculate the Col Sum
sums_U <- colSums(uni_DFM)
#PAck in Data table
uni_words <- data.table(word_1 = names(sums_U), count = sums_U)
#Sort
setorder(uni_words,-count)
#Print Table
kable(head(uni_words,15)) %>%
kable_styling("striped", full_width = F) %>%
add_indent(c(1, 3, 5))
| word_1 | count |
|---|---|
| mr | 763 |
| miss | 260 |
| mrs | 201 |
| william | 87 |
| john | 72 |
| master | 61 |
| henry | 49 |
| charles | 40 |
| james | 38 |
| george | 37 |
| thomas | 36 |
| joseph | 31 |
| mary | 30 |
| edward | 28 |
| elizabeth | 25 |
The most interesting results are Mr, Miss, Mrs and Master (Master in the old days is a boy under 18) We can use this information for two purpose
Based on the results build a new column Mr, Miss, Mrs or Master
#Use the Tokens Vector to build a title column
Text<-sapply(Text.Sentences,as.character)
regex <-
chkmr<-grepl(paste0(sprintf("(?=.*%s)", "mr"), collapse = ''), Text, perl = TRUE)
chkmiss<-grepl(paste0(sprintf("(?=.*%s)", "miss"), collapse = ''), Text, perl = TRUE)
chkmrs<-grepl(paste0(sprintf("(?=.*%s)", "mrs"), collapse = ''), Text, perl = TRUE)
chkmaster<-grepl(paste0(sprintf("(?=.*%s)", "master"), collapse = ''), Text, perl = TRUE)
RawData<-RawData%>%mutate(Title=ifelse(chkmrs,"Mrs",ifelse(chkmiss,"Miss",ifelse(chkmr,"Mr.",
ifelse(chkmaster,"Master","None")))))
Let’s look at a summary after populating the title.
kable(table(RawData$Title),col.names = c("Title","Frequency")) %>%
kable_styling("striped", full_width = F)
| Title | Frequency |
|---|---|
| Master | 61 |
| Miss | 258 |
| Mr. | 758 |
| Mrs | 201 |
| None | 31 |
Let’s look on the None title items
kable(filter(RawData,Title =="None"))%>%
kable_styling(c("striped", "bordered"))
| PassengerId | Survived | Pclass | Name | Sex | Age | SibSp | Parch | Ticket | Fare | Cabin | Embarked | Title |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 31 | 0 | 1 | Uruchurtu, Don. Manuel E | male | 40 | 0 | 0 | PC 17601 | 27.7208 | C | None | |
| 150 | 0 | 2 | Byles, Rev. Thomas Roussel Davids | male | 42 | 0 | 0 | 244310 | 13.0000 | S | None | |
| 151 | 0 | 2 | Bateman, Rev. Robert James | male | 51 | 0 | 0 | S.O.P. 1166 | 12.5250 | S | None | |
| 246 | 0 | 1 | Minahan, Dr. William Edward | male | 44 | 2 | 0 | 19928 | 90.0000 | C78 | Q | None |
| 250 | 0 | 2 | Carter, Rev. Ernest Courtenay | male | 54 | 1 | 0 | 244252 | 26.0000 | S | None | |
| 318 | 0 | 2 | Moraweck, Dr. Ernest | male | 54 | 0 | 0 | 29011 | 14.0000 | S | None | |
| 370 | 1 | 1 | Aubart, Mme. Leontine Pauline | female | 24 | 0 | 0 | PC 17477 | 69.3000 | B35 | C | None |
| 399 | 0 | 2 | Pain, Dr. Alfred | male | 23 | 0 | 0 | 244278 | 10.5000 | S | None | |
| 444 | 1 | 2 | Reynaldo, Ms. Encarnacion | female | 28 | 0 | 0 | 230434 | 13.0000 | S | None | |
| 450 | 1 | 1 | Peuchen, Major. Arthur Godfrey | male | 52 | 0 | 0 | 113786 | 30.5000 | C104 | S | None |
| 537 | 0 | 1 | Butt, Major. Archibald Willingham | male | 45 | 0 | 0 | 113050 | 26.5500 | B38 | S | None |
| 627 | 0 | 2 | Kirkland, Rev. Charles Leonard | male | 57 | 0 | 0 | 219533 | 12.3500 | Q | None | |
| 633 | 1 | 1 | Stahelin-Maeglin, Dr. Max | male | 32 | 0 | 0 | 13214 | 30.5000 | B50 | C | None |
| 642 | 1 | 1 | Sagesser, Mlle. Emma | female | 24 | 0 | 0 | PC 17477 | 69.3000 | B35 | C | None |
| 648 | 1 | 1 | Simonius-Blumer, Col. Oberst Alfons | male | 56 | 0 | 0 | 13213 | 35.5000 | A26 | C | None |
| 661 | 1 | 1 | Frauenthal, Dr. Henry William | male | 50 | 2 | 0 | PC 17611 | 133.6500 | S | None | |
| 695 | 0 | 1 | Weir, Col. John | male | 60 | 0 | 0 | 113800 | 26.5500 | S | None | |
| 746 | 0 | 1 | Crosby, Capt. Edward Gifford | male | 70 | 1 | 1 | WE/P 5735 | 71.0000 | B22 | S | None |
| 760 | 1 | 1 | Rothes, the Countess. of (Lucy Noel Martha Dyer-Edwards) | female | 33 | 0 | 0 | 110152 | 86.5000 | B77 | S | None |
| 767 | 0 | 1 | Brewe, Dr. Arthur Jackson | male | NA | 0 | 0 | 112379 | 39.6000 | C | None | |
| 797 | 1 | 1 | Leader, Dr. Alice (Farnham) | female | 49 | 0 | 0 | 17465 | 25.9292 | D17 | S | None |
| 823 | 0 | 1 | Reuchlin, Jonkheer. John George | male | 38 | 0 | 0 | 19972 | 0.0000 | S | None | |
| 849 | 0 | 2 | Harper, Rev. John | male | 28 | 0 | 1 | 248727 | 33.0000 | S | None | |
| 887 | 0 | 2 | Montvila, Rev. Juozas | male | 27 | 0 | 0 | 211536 | 13.0000 | S | None | |
| 980 | 0 | 3 | O’Donoghue, Ms. Bridget | female | NA | 0 | 0 | 364856 | 7.7500 | Q | None | |
| 1023 | 0 | 1 | Gracie, Col. Archibald IV | male | 53 | 0 | 0 | 113780 | 28.5000 | C51 | C | None |
| 1041 | 0 | 2 | Lahtinen, Rev. William | male | 30 | 1 | 1 | 250651 | 26.0000 | S | None | |
| 1056 | 0 | 2 | Peruschitz, Rev. Joseph Maria | male | 41 | 0 | 0 | 237393 | 13.0000 | S | None | |
| 1094 | 0 | 1 | Astor, Col. John Jacob | male | 47 | 1 | 0 | PC 17757 | 227.5250 | C62 C64 | C | None |
| 1185 | 0 | 1 | Dodge, Dr. Washington | male | 53 | 1 | 1 | 33638 | 81.8583 | A34 | S | None |
| 1306 | 0 | 1 | Oliva y Ocana, Dona. Fermina | female | 39 | 0 | 0 | PC 17758 | 108.9000 | C105 | C | None |
It looks like we should add the following 1. Dr. - For Doctors 2. Rev - 3. Army - Majo. Col. Capt. 4. Dona and Don will be treated as Mrs and Mr.
chkdr<-grepl(paste0(sprintf("(?=.*%s)", "dr"), collapse = ''), Text, perl = TRUE)
chkrev<-grepl(paste0(sprintf("(?=.*%s)", "rev"), collapse = ''), Text, perl = TRUE)
chkArmy1<-grepl(paste0(sprintf("(?=.*%s)", "major"), collapse = ''), Text, perl = TRUE)
chkArmy2<-grepl(paste0(sprintf("(?=.*%s)", "col"), collapse = ''), Text, perl = TRUE)
chkArmy3<-grepl(paste0(sprintf("(?=.*%s)", "capt"), collapse = ''), Text, perl = TRUE)
RawData<-RawData%>%
mutate(Title2=ifelse(chkdr ,"Dr.",ifelse(chkrev,"Rev.",ifelse(chkArmy1,"Army",ifelse(chkArmy2,"Army", ifelse(chkArmy3,"Army","None"))))))
chkdon<-grepl(paste0(sprintf("(?=.*%s)", "don"), collapse = ''), Text, perl = TRUE)
chkdone<-grepl(paste0(sprintf("(?=.*%s)", "dona"), collapse = ''), Text, perl = TRUE)
RawData<-RawData%>%
mutate(Title3=ifelse(chkdon ,"Mr.",ifelse(chkdone,"Mrs" ,"None")))
Merging the three titles
RawData$Title[RawData$Title == "None"] <- RawData$Title2[RawData$Title == "None"]
RawData$Title[RawData$Title == "None"] <- RawData$Title3[RawData$Title == "None"]
RawData <- subset(RawData, select = -c(Title2,Title3))
Rawtrain[,"Title"]<-RawData[1:nrow(Rawtrain),"Title"]
test.index<-(nrow(Rawtrain)+1):(nrow(Rawtrain)+nrow(Rawtest))
Rawtest[,"Title"]<-RawData[test.index,]$Title
Plot a Survived Death boxplot based on the titles
#Plot a Survived Death boxplot based on the titles
p<-ggplot(Rawtrain, aes((y =..count..),x=Title)) +
geom_bar(aes(fill = as.factor(Survived)), position = "dodge")
p<-p+xlab("Title") + ylab("Count")
p<-p+scale_fill_manual(name="Survived/Death ",
values=c("red", "green"), labels=c("Death", "Survived"))
p<-p+labs(title=" Bar Plot Title -- Survived ")
p
First, let’s see with current data what is the data about Age per Title.
p<-ggplot(Rawtrain, aes(x=as.factor(Title),y=Age,color = as.factor(Title))) + geom_boxplot()
p<-p+xlab("Title") + ylab("Age - Years")
p<-p+labs(title=" Boxplot Of Title -- Age ")
p
## Warning: Removed 177 rows containing non-finite values (stat_boxplot).
Calculate Mean Median and SD for Age per Title ### Train
TitlesOptions<-c("Mr.","Miss","Mrs","Master","Dr.","Army","Rev.")
for (i in 1:length(TitlesOptions))
{
df<-subset(Rawtrain,(Title==TitlesOptions[i]))
Mean<-mean(df$Age,na.rm=TRUE)
Median<-median(df$Age,na.rm=TRUE)
Sdv<-sd(df$Age,na.rm=TRUE)
n = dim((subset(Rawtrain,Title==TitlesOptions[i] & !is.na(Age))))[1]
Esdv <- sqrt(Sdv^2/n)
if (i==1)
DFAgeStat<-data.frame("Title" = TitlesOptions[i],"Mean" = Mean , "Median" = Median , "Sdv" = Sdv , "Esdv" = Esdv )
else
{
DF<-data.frame("Title" = TitlesOptions[i],"Mean" = Mean , "Median" = Median , "Sdv" = Sdv , "Esdv" = Esdv )
DFAgeStat<-rbind(DFAgeStat,DF)
}
}
#Show the statisticl results
kable((DFAgeStat)) %>%
kable_styling("striped", full_width = F)
| Title | Mean | Median | Sdv | Esdv |
|---|---|---|---|---|
| Mr. | 32.428750 | 30.0 | 12.709815 | 0.6354907 |
| Miss | 21.777778 | 21.0 | 13.077436 | 1.0897863 |
| Mrs | 35.642857 | 35.0 | 11.506251 | 1.0872386 |
| Master | 4.574167 | 3.5 | 3.619872 | 0.6033119 |
| Dr. | 42.000000 | 46.5 | 12.016655 | 4.9057789 |
| Army | 56.600000 | 56.0 | 9.316652 | 4.1665333 |
| Rev. | 43.166667 | 46.5 | 13.136463 | 5.3629387 |
SexOptions<-c("male","female")
for (i in 1:length(SexOptions))
{
df<-subset(Rawtrain,(Sex==SexOptions[i]))
Mean<-mean(df$Age,na.rm=TRUE)
Median<-median(df$Age,na.rm=TRUE)
Sdv<-sd(df$Age,na.rm=TRUE)
if (i==1)
DFSexStat<-data.frame("Sex" = SexOptions[i],"Mean" = Mean , "Median" = Median , "Sdv" = Sdv )
else
{
DF<-data.frame("Sex" = SexOptions[i],"Mean" = Mean , "Median" = Median , "Sdv" = Sdv )
DFSexStat<-rbind(DFSexStat,DF)
}
}
kable((DFSexStat)) %>%
kable_styling("striped", full_width = F)
| Sex | Mean | Median | Sdv |
|---|---|---|---|
| male | 30.72664 | 29 | 14.67820 |
| female | 27.91571 | 27 | 14.11015 |
DFAgeStat<-setDT(DFAgeStat)
setkey(DFAgeStat,Title,Mean,Median,Sdv)
The logic will be: If we have Title and missing age we will use the mean title to complete If we have the age data but no Title data we can add the Title (based on age and sex) if we only have sex, we will use it to evaluate the age and title
Preparing three approaches for later user 1. Use the mean 2. Use the median 3. Use Random normal distribution variable
Since this is like building a small model, we will use them Mean Median and the Random variables that is computes using the train data and complete the data using the model for both the test and the train sets
#Handle cases where Age is missing, but we have the Title data
# The
#Loop creating the three options described above
TitlesOptions<-c("Mr.","Miss","Mrs","Master","Dr.","Army","Rev.")
Rawtrain<-mutate(Rawtrain,AgeMean=Age)
Rawtrain<-mutate(Rawtrain,AgeMedian=Age)
Rawtrain<-mutate(Rawtrain,AgeNormal=Age)
set.seed(444)
for (i in 1:length(TitlesOptions))
{
#Mean
Rawtrain$AgeMean[is.na(Rawtrain$Age) & Rawtrain$Title == TitlesOptions[i]]<- DFAgeStat[.(TitlesOptions[i])]$Mean
#Median
Rawtrain$AgeMedian[is.na(Rawtrain$Age) & Rawtrain$Title == TitlesOptions[i]]<- DFAgeStat[.(TitlesOptions[i])]$Median
#Normal Distrbution variable based on the Mean and SD
#Calculate the number of missing items
n<-dim(subset(Rawtrain,is.na(Rawtrain$Age) & Rawtrain$Title == TitlesOptions[i]))[1]
#Create a Normal Random Variable with Mean and SD^2/n and sample according to n
Rawtrain$AgeNormal[is.na(Rawtrain$Age) & Rawtrain$Title == TitlesOptions[i]]<-
sample(rnorm(10000,DFAgeStat[.(TitlesOptions[i])]$Mean,DFAgeStat[.(TitlesOptions[i])]$Esdv),n)
}
#Repeat the same process over the test set
Rawtest<-mutate(Rawtest,AgeMean=Age)
Rawtest<-mutate(Rawtest,AgeMedian=Age)
Rawtest<-mutate(Rawtest,AgeNormal=Age)
set.seed(444)
for (i in 1:length(TitlesOptions))
{
#Mean
Rawtest$AgeMean[is.na(Rawtest$Age) & Rawtest$Title == TitlesOptions[i]]<- DFAgeStat[.(TitlesOptions[i])]$Mean
#Median
Rawtest$AgeMedian[is.na(Rawtest$Age) & Rawtest$Title == TitlesOptions[i]]<- DFAgeStat[.(TitlesOptions[i])]$Median
#Normal Distrbution variable based on the Mean and SD
#Calculate the number of missing items
n<-dim(subset(Rawtest,is.na(Rawtest$Age) & Rawtest$Title == TitlesOptions[i]))[1]
#Create a Normal Random Variable with Mean and SD^2/n and sample according to n
Rawtest$AgeNormal[is.na(Rawtest$Age) & Rawtest$Title == TitlesOptions[i]]<-
sample(rnorm(10000,DFAgeStat[.(TitlesOptions[i])]$Mean,DFAgeStat[.(TitlesOptions[i])]$Esdv),n)
}
Finaly let’s compare the 4 new Age tables to verify the results (Train set)
#Plot the Original Age column (this time without the NA values)
p1<-ggplot(subset(Rawtrain,!is.na(Age)), aes(x=as.factor(Title),
y=Age,color = as.factor(Title))) + geom_boxplot()
p1<-p1+xlab("Title") + ylab("Age - Years")
p1<-p1+labs(title=" Boxplot Of Title -- Age ")
p1<-p1+theme(axis.title.x = element_text(size = rel(1.2)),
axis.title.y = element_text(size = rel(1.2)),
legend.text = element_text(size = rel(1.2)),
axis.text.x.top = element_text(size = rel(1.2)),
legend.title = element_text(size = rel(1.2)))
p1<-p1+theme(axis.text.x = element_text(size=12 ))
p1<-p1+theme(axis.text.y = element_text(size=12 ))
p1<-p1+ labs(col="Title")
p2<-ggplot(Rawtrain, aes(x=as.factor(Title),y=AgeMean,color = as.factor(Title))) + geom_boxplot()
p2<-p2+xlab("Title") + ylab("Age - Years")
p2<-p2+labs(title=" Boxplot Of Title -- Age Missing Data Method: - Mean ")
p2<-p2+theme(axis.title.x = element_text(size = rel(1.2)),
axis.title.y = element_text(size = rel(1.2)),
legend.text = element_text(size = rel(1.2)),
axis.text.x.top = element_text(size = rel(1.2)),
legend.title = element_text(size = rel(1.2)))
p2<-p2+theme(axis.text.x = element_text(size=12 ))
p2<-p2+theme(axis.text.y = element_text(size=12 ))
p2<-p2+ labs(col="Title")
p3<-ggplot(Rawtrain, aes(x=as.factor(Title),y=AgeMedian,color = as.factor(Title))) + geom_boxplot()
p3<-p3+xlab("Title") + ylab("Age - Years")
p3<-p3+labs(title=" Boxplot Of Title -- Age Missing Data Method: - Median ")
p3<-p3+theme(axis.title.x = element_text(size = rel(1.2)),
axis.title.y = element_text(size = rel(1.2)),
legend.text = element_text(size = rel(1.2)),
axis.text.x.top = element_text(size = rel(1.2)),
legend.title = element_text(size = rel(1.2)))
p3<-p3+theme(axis.text.x = element_text(size=12 ))
p3<-p3+theme(axis.text.y = element_text(size=12 ))
p3<-p3+ labs(col="Title")
p4<-ggplot(Rawtrain, aes(x=as.factor(Title),y=AgeNormal,color = as.factor(Title))) + geom_boxplot()
p4<-p4+xlab("Title") + ylab("Age - Years")
p4<-p4+labs(title=" Boxplot Of Title -- Age Missing Data Method: - Normal Random Variable ")
p4<-p4+theme(axis.title.x = element_text(size = rel(1.2)),
axis.title.y = element_text(size = rel(1.2)),
legend.text = element_text(size = rel(1.2)),
axis.text.x.top = element_text(size = rel(1.2)),
legend.title = element_text(size = rel(1.2)))
p4<-p4+theme(axis.text.x = element_text(size=12 ))
p4<-p4+theme(axis.text.y = element_text(size=12 ))
p4<-p4+ labs(col="Title")
grid.arrange(p1,p2,p3,p4,nrow =4)
Creating boxplot of Age versus titles 1.Original plot (without the NA) 2.With the method of replacing NA by mean 3. With the technique of replacing Na by median 4. With the approach of replacing NA by Random variable
#Plot the Original Age column (this time without the NA values)
p1<-ggplot(subset(Rawtest,!is.na(Age)), aes(x=as.factor(Title),
y=Age,color = as.factor(Title))) +
geom_boxplot()
p1<-p1+xlab("Title") + ylab("Age - Years")
p1<-p1+labs(title=" Boxplot Of Title -- Age ")
p1<-p1+theme(axis.title.x = element_text(size = rel(1.2)),
axis.title.y = element_text(size = rel(1.2)),
legend.text = element_text(size = rel(1.2)),
axis.text.x.top = element_text(size = rel(1.2)),
legend.title = element_text(size = rel(1.2)))
p1<-p1+theme(axis.text.x = element_text(size=12 ))
p1<-p1+theme(axis.text.y = element_text(size=12 ))
p1<-p1+ labs(col="Title")
#Plot the Age column (this time with Age mean instead of the NA values)
p2<-ggplot(Rawtest, aes(x=as.factor(Title),
y=AgeMean,color = as.factor(Title))) + geom_boxplot()
p2<-p2+xlab("Title") + ylab("Age - Years")
p2<-p2+labs(title=" Boxplot Of Title -- Age Missing Data Method: - Mean ")
p2<-p2+theme(axis.title.x = element_text(size = rel(1.2)),
axis.title.y = element_text(size = rel(1.2)),
legend.text = element_text(size = rel(1.2)),
axis.text.x.top = element_text(size = rel(1.2)),
legend.title = element_text(size = rel(1.2)))
p2<-p2+theme(axis.text.x = element_text(size=12 ))
p2<-p2+theme(axis.text.y = element_text(size=12 ))
p2<-p2+ labs(col="Title")
#Plot the Age column (this time with Age median instead of the NA values)
p3<-ggplot(Rawtest, aes(x=as.factor(Title),
y=AgeMedian,color = as.factor(Title))) + geom_boxplot()
p3<-p3+xlab("Title") + ylab("Age - Years")
p3<-p3+labs(title=" Boxplot Of Title -- Age Missing Data Method: - Median ")
p3<-p3+theme(axis.title.x = element_text(size = rel(1.2)),
axis.title.y = element_text(size = rel(1.2)),
legend.text = element_text(size = rel(1.2)),
axis.text.x.top = element_text(size = rel(1.2)),
legend.title = element_text(size = rel(1.2)))
p3<-p3+theme(axis.text.x = element_text(size=12 ))
p3<-p3+theme(axis.text.y = element_text(size=12 ))
p3<-p3+ labs(col="Title")
#Plot the Age column (this time with a normal random variable instead of the NA values)
p4<-ggplot(Rawtest, aes(x=as.factor(Title),
y=AgeNormal,color = as.factor(Title))) + geom_boxplot()
p4<-p4+xlab("Title") + ylab("Age - Years")
p4<-p4+labs(title=" Boxplot Of Title -- Age Missing Data Method: - Normal Random Variable ")
p4<-p4+theme(axis.title.x = element_text(size = rel(1.2)),
axis.title.y = element_text(size = rel(1.2)),
legend.text = element_text(size = rel(1.2)),
axis.text.x.top = element_text(size = rel(1.2)),
legend.title = element_text(size = rel(1.2)))
p4<-p4+theme(axis.text.x = element_text(size=12 ))
p4<-p4+theme(axis.text.y = element_text(size=12 ))
p4<-p4+ labs(col="Title")
grid.arrange(p1,p2,p3,p4,nrow =4)
As we can see in all methods, there is no drastic change (There are more outlier, but this is expected as we had more samples)
We can complete the missing titles data based on Age and Sex We will choose either Mr., Mrs. Miss or Master
#Subset the missing titles
res<-Rawtrain[Rawtrain$Title=="None",]
#Complate based on the following
#Male < 18 - Master
#Female <28 - Miss
#Female >=8 Mrs.
#Male >18 Mr.
Rawtrain[Rawtrain$Title=="None",]$Title<-
ifelse(res$Age<18 & res$Sex=="male","Master",ifelse(res$Age<28 &
res$Sex=="female","Miss",
ifelse(res$Sex=="female" & res$Age>=28,"Mrs","Mr.")))
Rawtest[Rawtest$Title=="None",]
## [1] PassengerId Pclass Name Sex Age
## [6] SibSp Parch Ticket Fare Cabin
## [11] Embarked Sexb Title AgeMean AgeMedian
## [16] AgeNormal
## <0 rows> (or 0-length row.names)
#Convert to Factors
Rawtrain$Title<-as.factor(Rawtrain$Title)
Rawtest$Title<-as.factor(Rawtest$Title)
No missing data for the Test set
Sibsp - Number of siblings/spouses aboard the Titanic Parch - Number of parents/children aboard the Titanic
Plot Sibsp Versus Survived Death
#Plot Sibsp Versus Survived Death
p<-ggplot(Rawtrain,aes(x=as.factor(SibSp),
group=as.factor(Survived),fill=as.factor(Survived)))+
geom_bar()
p<-p+labs(title=" Bar Plot of Sibsp Vs. Survived ")
p<-p+scale_fill_manual("legend",
labels = c("Dead", "Survived"), values = c("red","green"))
p<-p+xlab("Sibsp ") + ylab("Count")
p
Plot Parch Versus Survived Death
#Plot Parch Versus Survived Death
p<-ggplot(Rawtrain,aes(x=as.factor(Parch),
group=as.factor(Survived),fill=as.factor(Survived)))+
geom_bar()
p<-p+labs(title=" Bar Plot of Parch Vs. Survived ")
p<-p+scale_fill_manual("legend",labels = c("Dead", "Survived"),
values = c("red","green"))
p<-p+xlab("Parch ") + ylab("Count")
p
Let’s try to build some categorical parameters based on these two variables Ans see the impact on Survived/Death
First - Is the passengers alone
Rawtrain<-Rawtrain%>%mutate(Isalone=ifelse((SibSp+Parch) > 0,0,1))
Rawtest<-Rawtest%>%mutate(Isalone=ifelse((SibSp+Parch) > 0,0,1))
p<-ggplot(Rawtrain,aes(x=as.factor(Isalone),
group=as.factor(Survived),fill=as.factor(Survived)))+
geom_bar()
p<-p+labs(title=" Bar Plot of Isalone Vs. Survived ")
p<-p+xlab("Is Alone 1 - Yes 0 - No ") + ylab("Count")
p<-p+scale_fill_manual("legend",labels = c("Dead", "Survived"), values = c("red","green"))
p
Create a column with the total number of people in the group
Rawtrain<-Rawtrain%>%mutate(GroupNo=SibSp+Parch)
Rawtest<-Rawtest%>%mutate(GroupNo=SibSp+Parch)
p<-ggplot(Rawtrain,aes(x=as.factor(GroupNo),
group=as.factor(Survived),fill=as.factor(Survived)))+
geom_bar()
p<-p+labs(title=" Bar Plot of Group size Vs. Survived ")
p<-p+scale_fill_manual("legend",labels = c("Dead", "Survived"), values = c("red","green"))
p
It looks like a group of 4 and above has less chance to survive.
#Rawtrain<-Rawtrain%>%mutate(IsLArgeGroup=ifelse(SibSp+Parch>=4,1,0))
#Rawtest<-Rawtest%>%mutate(IsLArgeGroup=ifelse(SibSp+Parch>=4,1,0))
Ticket observations contain a set of characters (or nothing) and then numbers The first step clean the ticket observations and create a column with only the character set and a separate column for the number
#Remove all the digits
#(Since the characters may have numbers also we remove when there is an occurrence of three or more )
RawData<-RawData%>%
mutate(TicketChar = gsub("[0-9]{3,}","",Ticket))
#Mark all empty Ticket Char by U
RawData$TicketChar<-ifelse(RawData$TicketChar=="","U",RawData$TicketChar)
#Create anothher colomn with the numbers
RawData<-RawData%>%
mutate(TicketNumbers = strtoi(str_extract(Ticket,"[0-9]{3,}")))
#In the majority of the data the above
#condition (3 digits or above in a row )
#is Enough, There are few other cases
#where there are only characters, or there is one digit
#Handle these cases
#Extract the Digits and place them in the TicketNumbers
res<-as.numeric((RawData%>%
filter(grepl("\\. [0-9]{1}$",Ticket)))$Ticket%>%str_extract_all(" [0-9]{1}$"))
RawData[grep("\\. [0-9]{1}$",RawData$Ticket),]$TicketNumbers<-res
#make sure that all values are numerics
RawData$TicketNumbers<-
as.numeric(RawData$TicketNumbers)
#Set number 100 to the Ticketnumbers
RawData[is.na(RawData$TicketNumbers),]$TicketNumbers<-
as.integer(mean(RawData$TicketNumbers,na.rm = TRUE))
#Create A normlize TicketNumbers
RawData<-RawData%>%mutate(TicketNumbers_Nor =normalize(TicketNumbers))
#Extract the Character part to TicketChar
res<-((RawData%>%filter(grepl("\\. [0-9]{1}$",Ticket)))$Ticket%>%str_extract_all("[^0-9 ]{1,}"))
RawData[grep("\\. [0-9]{1}$",RawData$Ticket),]$TicketChar<-res
RawData$TicketChar<-as.character(RawData$TicketChar)
#Handle the cases that there are no digits
res<-as.character(subset(RawData,grepl("^[A-Za-z]+$",Ticket,perl=T))$Ticket)
RawData[grep("^[A-Za-z]+$",RawData$Ticket,perl=T),]$TicketChar<-res
#Remove dots spaces and / from the Ticket chars
RawData$TicketChar<-str_replace_all(RawData$TicketChar,"[.]","")
RawData$TicketChar<-str_replace_all(RawData$TicketChar,"[ ]{1,}","")
RawData$TicketChar<-str_replace_all(RawData$TicketChar,"[///]{1,}","")
#Copy the Ticket new columns to the train and test sets
Rawtrain[,c("TicketChar","TicketNumbers","TicketNumbers_Nor")]<-
RawData[1:nrow(Rawtrain),c("TicketChar","TicketNumbers","TicketNumbers_Nor")]
test.index<-(nrow(Rawtrain)+1):(nrow(Rawtrain)+nrow(Rawtest))
Rawtest[,c("TicketChar","TicketNumbers","TicketNumbers_Nor")]<-
RawData[test.index,c("TicketChar","TicketNumbers","TicketNumbers_Nor")]
We need to verify that in the Test set there are no Ticket characters which are missing in the train set if this is the case we replace them with U
#Create a Match matrix
TrainTicketchr<-sort(unique(Rawtrain$TicketChar))
TestTicketchr<-sort(unique(Rawtest$TicketChar))
MatchTicketChar<-match(TestTicketchr,TrainTicketchr)
#Checkk all NA in the test set
Missing_Items<-TestTicketchr[which(is.na(MatchTicketChar))]
#Check how many items we are missing
count<-0
for(i in 1:length(Missing_Items))
{
count<-count + dim(Rawtest[Rawtest$TicketChar==Missing_Items[i],])[1]
#Change the Missing Test items to U
Rawtest[Rawtest$TicketChar==Missing_Items[i],]$TicketChar<-"U"
}
cat("Missing items",count)
## Missing items 6
#Create common factor levels
df<-rbind(Rawtrain[,-2],Rawtest)
df$TicketChar<-as.factor(df$TicketChar)
Rawtrain$TicketChar<-df[1:nrow(Rawtrain),]$TicketChar
test.index<-(nrow(Rawtrain)+1):(nrow(Rawtrain)+nrow(Rawtest))
Rawtest$TicketChar<-df[test.index,]$TicketChar
#Plot a barplot of the ticket prefix versus Survived or Death (Omit the Unspecified - U)
p<-ggplot(subset(Rawtrain,TicketChar!="U"),
aes(x=as.factor(TicketChar),
group=as.factor(Survived),fill=as.factor(Survived)))+
geom_bar()
p<-p+labs(title=" Bar Plot of TicketChar (Survived Vs. Death) ")
p<-p+theme(axis.text.x=element_text(angle=45, hjust=1))
p<-p+xlab("Ticket Characters")
p<-p+scale_fill_manual("legend",labels = c("Dead", "Survived"), values = c("red","green"))
p
The fare is a continues parameter There is one observation missing Replace the missing value by the mean
if (sum(is.na(RawData$Fare))>0)
RawData[is.na(RawData$Fare),]$Fare<-mean(RawData$Fare,na.rm = TRUE)
Rawtrain[,c("Fare")]<-RawData[1:nrow(Rawtrain),c("Fare")]
test.index<-(nrow(Rawtrain)+1):(nrow(Rawtrain)+nrow(Rawtest))
Rawtest[,c("Fare")]<-RawData[test.index,c("Fare")]
plot the Fare Versus Survived or Death
#plot the Fare Versus Survived or Death
p<-ggplot(Rawtrain,aes(x=Fare,
group=as.factor(Survived),fill=as.factor(Survived)))+
geom_density(alpha = 0.5)
p<-p+labs(title=" Density Plot of Fare (Survived Vs. Death) ")
p<-p+scale_fill_manual("legend",
labels = c("Dead", "Survived"), values = c("red","green"))
p
Split Fare to subarea
p<-subset(Rawtrain,Fare<50)%>%
ggplot(aes(x=Fare,group=as.factor(Survived),fill=as.factor(Survived)))+
geom_density(alpha = 0.5)
p<-p+labs(title=" Density Plot of Fare (Survived Vs. Death) Fare < 50")
p<-p+scale_fill_manual("legend",
labels = c("Dead", "Survived"), values = c("red","green"))
p
p<-subset(Rawtrain,Fare>50 & Fare<100)%>%
ggplot(aes(x=Fare,group=as.factor(Survived),fill=as.factor(Survived)))+
geom_density(alpha = 0.5)
p<-p+labs(title=" Density Plot of Fare (Survived Vs. Death) Fare Between 50 and 100")
p<-p+scale_fill_manual("legend",labels = c("Dead", "Survived"), values = c("red","green"))
p
p<-subset(Rawtrain,Fare>100)%>%
ggplot(aes(x=Fare,group=as.factor(Survived),fill=as.factor(Survived)))+
geom_density(alpha = 0.5)
p<-p+labs(title=" Density Plot of Fare (Survived Vs. Death) Fare Above 100")
p<-p+scale_fill_manual("legend",labels = c("Dead", "Survived"), values = c("red","green"))
p
Embarked - Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
#plot Embarked Versus Death Survived
p<-ggplot(Rawtrain, aes((y =..count..),x=Embarked)) +
geom_bar(aes(fill = as.factor(Survived)), position = "dodge")
p<-p+xlab("Embarked") + ylab("Count")
p<-p+scale_fill_manual(name="Survived/Death ",values=c("red", "green"),
labels=c("Death", "Survived"))
p<-p+labs(title=" Bar Plot Embarked -- Survived ")
p
#There are two missing values
#Assign U to these values
RawData$Embarked<-as.character(RawData$Embarked)
if (sum(RawData$Embarked=="")>0)
RawData[RawData$Embarked=="",]$Embarked<-"U"
RawData$Embarked<-as.factor(RawData$Embarked)
Rawtrain$Embarked<-RawData[1:nrow(Rawtrain),]$Embarked
test.index<-(nrow(Rawtrain)+1):(nrow(Rawtrain)+nrow(Rawtest))
Rawtest$Embarked<-RawData[test.index,]$Embarked
The cabin data has a high percentage of missing data First, let’s split the Cabin to the character and digits. Some of the cabin cells include more than one cabin, but they are adjacent We will use only the first one
RawData<-RawData%>%
mutate(Cabin_Charcter = str_extract(Cabin,"[A-Za-z][0-9]"))
RawData$Cabin_Charcter<-str_extract(RawData$Cabin_Charcter,"^[A-Za-z]")
#Mark all missing cabin character as U
RawData[is.na(RawData$Cabin_Charcter),]$Cabin_Charcter<-"U"
#Convert to Factor
RawData$Cabin_Charcter<-as.factor(RawData$Cabin_Charcter)
#Copy the Cabin Character to the train and test sets
Rawtrain[,"Cabin_Charcter"]<-RawData[1:nrow(Rawtrain),"Cabin_Charcter"]
test.index<-(nrow(Rawtrain)+1):(nrow(Rawtrain)+nrow(Rawtest))
Rawtest[,"Cabin_Charcter"]<-RawData[test.index,]$Cabin_Charcter
Lets see the Survivle/Death Versus the Cabin location
p<-ggplot(subset(Rawtrain,is.na(Cabin_Charcter)==FALSE),
aes(x=as.factor(Cabin_Charcter),
group=as.factor(Survived),fill=as.factor(Survived)))+
geom_bar( position = "dodge")
p<-p+labs(title=" Bar Plot of Cabin Vs. Survived ")
p<-p+xlab("Cabin ") + ylab("Count")
p<-p+scale_fill_manual("legend",
labels = c("Dead", "Survived"), values = c("red","green"))
p
Some of the Machine learning algorithms can leverage normalize values (for the continuous observations) We will create normlize colomns for 1. Fare 2. Ticket Number 3. The various Age options (Mean,Median and RV)
Rawtrain<-Rawtrain%>%
mutate(Fare_Nor=normalize(Fare))
Rawtest<-Rawtest%>%
mutate(Fare_Nor=normalize(Fare))
Rawtrain<-Rawtrain%>%
mutate(TicketNumbers_Nor=normalize(TicketNumbers))
Rawtest<-Rawtest%>%
mutate(TicketNumbers_Nor=normalize(TicketNumbers))
Rawtrain<-Rawtrain%>%
mutate(AgeMean_Nor=normalize(AgeMean))
Rawtrain<-Rawtrain%>%
mutate(AgeMedian_Nor=normalize(AgeMedian))
Rawtrain<-Rawtrain%>%
mutate(AgeNormal_Nor=normalize(AgeNormal))
Rawtest<-Rawtest%>%
mutate(AgeMean_Nor=normalize(AgeMean))
Rawtest<-Rawtest%>%
mutate(AgeMedian_Nor=normalize(AgeMedian))
Rawtest<-Rawtest%>%
mutate(AgeNormal_Nor=normalize(AgeNormal))
Rawtrain<-Rawtrain%>%
mutate(GroupNo_Nor=normalize(GroupNo))
Rawtest<-Rawtest%>%
mutate(GroupNo_Nor=normalize(GroupNo))
Convert Survived to a factor variable
Rawtrain$Survived<-as.factor(Rawtrain$Survived)
Some models can not handle factor variables with more than 3 levels Creating dummy variable
#Create dummies variables for items with more than two options
RawData$Pclass<-as.factor(RawData$Pclass)
RawData$Embarked<-as.factor(RawData$Embarked)
RawData$Title<-as.factor(RawData$Title)
Raw_Dummy<-dummy_cols(RawData,select_columns = c("Pclass","Title","Embarked","Cabin_Charcter","TicketChar"),remove_first_dummy = TRUE)
Raw_Dummy<-Raw_Dummy[,18:ncol(Raw_Dummy)]
#Copy the Cabin Character to the train and test sets
test.index<-(nrow(Rawtrain)+1):(nrow(Rawtrain)+nrow(Rawtest))
Train_Dummy<-Raw_Dummy[-test.index,]
Test_Dummy<-Raw_Dummy[test.index,]
These basic models scored ~~ 0.784-0.789
train1<-dplyr::select(Rawtrain,Survived,Pclass,Sex,AgeMean,Fare,Embarked,Title,GroupNo,
TicketChar,TicketNumbers,Cabin_Charcter,TicketNumbers_Nor,AgeNormal_Nor,
AgeMedian_Nor,Fare_Nor,TicketNumbers_Nor,GroupNo_Nor,AgeMean_Nor)
test1<-dplyr::select(Rawtest,Pclass,Sex,AgeMean,Fare,Embarked,Title,GroupNo,
TicketChar,TicketNumbers,Cabin_Charcter,TicketNumbers_Nor,AgeNormal_Nor,
AgeMedian_Nor,Fare_Nor,TicketNumbers_Nor,GroupNo_Nor,AgeMean_Nor)
#Split Data to train and validiation
set.seed(1254)
train.index<-sample(1:nrow(train1),0.7*nrow(train1))
xtrain<-train1[train.index,]
valid<-train1[-train.index,]
train_control <- trainControl(method = "cv", number = 10)
set.seed(12345)
#train the model on training set
model.lg1 <- train(Survived ~ Sex+Fare+GroupNo+AgeMean_Nor+Cabin_Charcter+Pclass+Embarked+Title,
data = xtrain,
method = "glm",
trControl = train_control,
family = binomial(link = "logit"))
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# print cv scores
summary(model.lg1)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7890 -0.4920 -0.3370 0.4438 2.5284
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.999e+00 1.083e+00 4.616 3.90e-06 ***
## Sexmale -3.518e+00 1.295e+00 -2.718 0.00658 **
## Fare 3.145e-03 2.943e-03 1.069 0.28522
## GroupNo -4.881e-01 1.074e-01 -4.546 5.47e-06 ***
## AgeMean_Nor -1.752e+00 9.670e-01 -1.812 0.07001 .
## Cabin_CharcterB -4.376e-01 9.352e-01 -0.468 0.63984
## Cabin_CharcterC -4.732e-01 8.912e-01 -0.531 0.59543
## Cabin_CharcterD 3.323e-01 1.004e+00 0.331 0.74068
## Cabin_CharcterE 9.211e-01 9.515e-01 0.968 0.33301
## Cabin_CharcterF -2.325e-01 1.706e+00 -0.136 0.89160
## Cabin_CharcterG -1.709e+00 1.660e+00 -1.029 0.30334
## Cabin_CharcterU -1.231e+00 9.840e-01 -1.251 0.21080
## Pclass2 -1.783e-01 6.670e-01 -0.267 0.78921
## Pclass3 -1.312e+00 6.593e-01 -1.990 0.04661 *
## EmbarkedQ -5.833e-01 4.992e-01 -1.168 0.24262
## EmbarkedS -6.603e-01 3.192e-01 -2.068 0.03860 *
## EmbarkedU 1.311e+01 2.400e+03 0.005 0.99564
## TitleDr. -1.511e+01 1.686e+03 -0.009 0.99285
## TitleMaster 3.798e+00 1.472e+00 2.581 0.00986 **
## TitleMiss -8.102e-01 4.303e-01 -1.883 0.05972 .
## TitleMr. -4.271e-01 1.263e+00 -0.338 0.73533
## TitleMrs NA NA NA NA
## TitleRev. -1.497e+01 9.718e+02 -0.015 0.98771
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 828.65 on 622 degrees of freedom
## Residual deviance: 461.92 on 601 degrees of freedom
## AIC: 505.92
##
## Number of Fisher Scoring iterations: 15
model.lg1
## Generalized Linear Model
##
## 623 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 562, 561, 560, 561, 560, 562, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8254082 0.6321406
ypredict<-predict(model.lg1,newdata=valid)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
confusionMatrix(ypredict,valid$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 150 32
## 1 14 72
##
## Accuracy : 0.8284
## 95% CI : (0.7778, 0.8715)
## No Information Rate : 0.6119
## P-Value [Acc > NIR] : 1.201e-14
##
## Kappa : 0.6268
## Mcnemar's Test P-Value : 0.01219
##
## Sensitivity : 0.9146
## Specificity : 0.6923
## Pos Pred Value : 0.8242
## Neg Pred Value : 0.8372
## Prevalence : 0.6119
## Detection Rate : 0.5597
## Detection Prevalence : 0.6791
## Balanced Accuracy : 0.8035
##
## 'Positive' Class : 0
##
#Predict and write to file
#res<-predict(model.lg1,newdata = test1)
#res<-data.frame(res)
#colnames(res)<-c("Survived")
#outcome<-cbind(Rawtest$PassengerId,res)
#colnames(outcome)<-c("PassengerId","Survived")
#write.csv(outcome,"out.csv",row.names=FALSE)
This submission got 0.78947
#Split Data to train and validiation
set.seed(1254)
train.index<-sample(1:nrow(train1),0.7*nrow(train1))
xtrain<-train1[train.index,]
valid<-train1[-train.index,]
train_control <- trainControl(method = "cv", number = 10)
set.seed(12345)
#train the model on training set
model.lg2 <- train(Survived ~ Sex+Fare+GroupNo+AgeMean_Nor+Cabin_Charcter+Pclass+Embarked+Title+Sex*AgeMean_Nor+Title*AgeMean_Nor,
data = xtrain,
method = "glm",
trControl = train_control,
family = binomial(link = "logit"))
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# print cv scores
summary(model.lg2)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8955 -0.5026 -0.3372 0.4404 2.5480
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 4.559e+00 1.381e+00 3.302 0.00096 ***
## Sexmale -2.173e+00 7.955e+00 -0.273 0.78470
## Fare 3.315e-03 2.979e-03 1.113 0.26580
## GroupNo -5.146e-01 1.145e-01 -4.493 7.01e-06 ***
## AgeMean_Nor -7.992e-01 2.401e+00 -0.333 0.73922
## Cabin_CharcterB -3.847e-01 9.358e-01 -0.411 0.68099
## Cabin_CharcterC -4.094e-01 8.961e-01 -0.457 0.64772
## Cabin_CharcterD 3.777e-01 1.009e+00 0.374 0.70807
## Cabin_CharcterE 9.281e-01 9.485e-01 0.979 0.32782
## Cabin_CharcterF -2.885e-01 1.744e+00 -0.165 0.86859
## Cabin_CharcterG -1.903e+00 1.701e+00 -1.119 0.26328
## Cabin_CharcterU -1.198e+00 9.868e-01 -1.214 0.22458
## Pclass2 -1.410e-01 6.697e-01 -0.211 0.83325
## Pclass3 -1.302e+00 6.613e-01 -1.969 0.04892 *
## EmbarkedQ -5.592e-01 5.010e-01 -1.116 0.26431
## EmbarkedS -6.362e-01 3.212e-01 -1.981 0.04762 *
## EmbarkedU 1.274e+01 2.400e+03 0.005 0.99576
## TitleDr. -1.825e+01 1.657e+04 -0.001 0.99912
## TitleMaster 3.127e+00 7.957e+00 0.393 0.69433
## TitleMiss -8.080e-02 1.190e+00 -0.068 0.94586
## TitleMr. -1.515e+00 7.891e+00 -0.192 0.84773
## TitleMrs NA NA NA NA
## TitleRev. -1.676e+01 3.632e+03 -0.005 0.99632
## `Sexmale:AgeMean_Nor` -2.417e+00 1.192e+01 -0.203 0.83935
## `AgeMean_Nor:TitleDr.` 5.045e+00 2.701e+04 0.000 0.99985
## `AgeMean_Nor:TitleMaster` -2.133e+00 1.685e+01 -0.127 0.89926
## `AgeMean_Nor:TitleMiss` -2.257e+00 3.065e+00 -0.736 0.46151
## `AgeMean_Nor:TitleMr.` 1.755e+00 1.175e+01 0.149 0.88126
## `AgeMean_Nor:TitleMrs` NA NA NA NA
## `AgeMean_Nor:TitleRev.` 2.962e+00 6.511e+03 0.000 0.99964
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 828.65 on 622 degrees of freedom
## Residual deviance: 461.13 on 595 degrees of freedom
## AIC: 517.13
##
## Number of Fisher Scoring iterations: 15
model.lg2
## Generalized Linear Model
##
## 623 samples
## 8 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 562, 561, 560, 561, 560, 562, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8190325 0.6194348
ypredict<-predict(model.lg2,newdata=valid)
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
confusionMatrix(ypredict,valid$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 151 32
## 1 13 72
##
## Accuracy : 0.8321
## 95% CI : (0.7819, 0.8748)
## No Information Rate : 0.6119
## P-Value [Acc > NIR] : 3.86e-15
##
## Kappa : 0.6342
## Mcnemar's Test P-Value : 0.00729
##
## Sensitivity : 0.9207
## Specificity : 0.6923
## Pos Pred Value : 0.8251
## Neg Pred Value : 0.8471
## Prevalence : 0.6119
## Detection Rate : 0.5634
## Detection Prevalence : 0.6828
## Balanced Accuracy : 0.8065
##
## 'Positive' Class : 0
##
#Predict and write to file
#res<-predict(model.lg1,newdata = test1)
#res<-data.frame(res)
#colnames(res)<-c("Survived")
#outcome<-cbind(Rawtest$PassengerId,res)
#colnames(outcome)<-c("PassengerId","Survived")
#write.csv(outcome,"out.csv",row.names=FALSE)
set.seed(12312)
x.train <- model.matrix(Survived ~Sex+Fare_Nor+GroupNo_Nor+AgeMean_Nor+Cabin_Charcter+Pclass+Embarked+Title, xtrain)
y<- ifelse(xtrain$Survived == 1, 1, 0)
cv.lasso <- cv.glmnet(x.train, y, alpha = 1, family = "binomial",nfolds = 3)
model.Lasso <- glmnet(x.train, y, alpha = 1, family = "binomial",
lambda = cv.lasso$lambda.min)
x.test <- model.matrix(Survived ~Sex+Fare_Nor+GroupNo_Nor+AgeMean_Nor+Cabin_Charcter+Pclass+Embarked+Title, valid)
probabilities <- model.Lasso %>% predict(newx = x.test)
res <- ifelse(probabilities > 0.5, 1, 0)
res<-as.factor(res)
confusionMatrix(res,valid$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 158 45
## 1 6 59
##
## Accuracy : 0.8097
## 95% CI : (0.7575, 0.8549)
## No Information Rate : 0.6119
## P-Value [Acc > NIR] : 2.403e-12
##
## Kappa : 0.5698
## Mcnemar's Test P-Value : 1.032e-07
##
## Sensitivity : 0.9634
## Specificity : 0.5673
## Pos Pred Value : 0.7783
## Neg Pred Value : 0.9077
## Prevalence : 0.6119
## Detection Rate : 0.5896
## Detection Prevalence : 0.7575
## Balanced Accuracy : 0.7654
##
## 'Positive' Class : 0
##
This submission got 0.77990
set.seed(12312)
x.train <- model.matrix(Survived ~Sex+Fare_Nor+GroupNo_Nor+AgeMean_Nor+Cabin_Charcter+Pclass+Embarked+Title, xtrain)
y<- ifelse(xtrain$Survived == 1, 1, 0)
cv.Ridge <- cv.glmnet(x.train, y, alpha = 0, family = "binomial")
model.ridge <- glmnet(x.train, y, alpha = 0, family = "binomial",
lambda = cv.Ridge$lambda.min)
x.test <- model.matrix(Survived ~Sex+Fare_Nor+GroupNo_Nor+AgeMean_Nor+Cabin_Charcter+Pclass+Embarked+Title, valid)
probabilities <- cv.Ridge %>% predict(newx = x.test)
res <- ifelse(probabilities > 0.5, 1, 0)
res<-as.factor(res)
confusionMatrix(res,valid$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 161 53
## 1 3 51
##
## Accuracy : 0.791
## 95% CI : (0.7374, 0.8381)
## No Information Rate : 0.6119
## P-Value [Acc > NIR] : 2.655e-10
##
## Kappa : 0.5176
## Mcnemar's Test P-Value : 5.835e-11
##
## Sensitivity : 0.9817
## Specificity : 0.4904
## Pos Pred Value : 0.7523
## Neg Pred Value : 0.9444
## Prevalence : 0.6119
## Detection Rate : 0.6007
## Detection Prevalence : 0.7985
## Balanced Accuracy : 0.7360
##
## 'Positive' Class : 0
##
This submission got 0.77511
#Split Data to train and validiation
set.seed(1254)
train.index<-sample(1:nrow(train1),0.7*nrow(train1))
xtrain<-train1[train.index,]
valid<-train1[-train.index,]
#Fit Random Forest Model
rf = randomForest(as.factor(Survived) ~Sex+Fare+GroupNo+AgeMean+Cabin_Charcter+Pclass+Embarked+Title+TicketChar+TicketNumbers, #
ntree = 400,nodesize =5,mtry=22,
data = xtrain)
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within
## valid range
plot(rf)
var.imp = data.frame(importance(rf,
type=2))
# make row names as columns
var.imp$Variables = row.names(var.imp)
print(var.imp[order(var.imp$MeanDecreaseGini,decreasing = T),])
## MeanDecreaseGini Variables
## Title 105.7998374 Title
## TicketNumbers 41.6235774 TicketNumbers
## Fare 35.2912966 Fare
## Pclass 21.9215027 Pclass
## AgeMean 21.5130449 AgeMean
## Cabin_Charcter 14.5944864 Cabin_Charcter
## GroupNo 11.3479746 GroupNo
## TicketChar 9.1992271 TicketChar
## Embarked 2.2918210 Embarked
## Sex 0.5205104 Sex
#predict
res<-predict(rf , valid)
confusionMatrix(data = res,
reference = valid$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 148 34
## 1 16 70
##
## Accuracy : 0.8134
## 95% CI : (0.7615, 0.8582)
## No Information Rate : 0.6119
## P-Value [Acc > NIR] : 8.745e-13
##
## Kappa : 0.5943
## Mcnemar's Test P-Value : 0.01621
##
## Sensitivity : 0.9024
## Specificity : 0.6731
## Pos Pred Value : 0.8132
## Neg Pred Value : 0.8140
## Prevalence : 0.6119
## Detection Rate : 0.5522
## Detection Prevalence : 0.6791
## Balanced Accuracy : 0.7878
##
## 'Positive' Class : 0
##
trainOnlyDummies<-dplyr::select(Rawtrain,Survived,Sex,AgeMean,Fare,GroupNo,
TicketNumbers)
testOnlyDummies<-dplyr::select(Rawtest,Sex,AgeMean,Fare,GroupNo,
TicketNumbers)
trainOnlyDummies<-cbind(trainOnlyDummies,Train_Dummy)
testOnlyDummies<-cbind(testOnlyDummies,Test_Dummy)
set.seed(1254)
train.index<-sample(1:nrow(trainOnlyDummies),.7*nrow(trainOnlyDummies))
xtrain<-trainOnlyDummies[train.index,]
valid<-trainOnlyDummies[-train.index,]
set.seed(1254)
#Fit Random Forest Model
rf = randomForest(as.factor(Survived) ~., #
ntree = 1500,nodesize =5,mtry=22,
data = xtrain)
plot(rf)
var.imp = data.frame(importance(rf,
type=2))
# make row names as columns
var.imp$Variables = row.names(var.imp)
print(var.imp[order(var.imp$MeanDecreaseGini,decreasing = T),])
## MeanDecreaseGini Variables
## Sex 5.911214e+01 Sex
## TicketNumbers 3.927393e+01 TicketNumbers
## Fare 3.429660e+01 Fare
## AgeMean 2.858818e+01 AgeMean
## GroupNo 1.558652e+01 GroupNo
## Title_Mrs 1.451353e+01 Title_Mrs
## Pclass_1 1.093929e+01 Pclass_1
## Title_Miss 9.993897e+00 Title_Miss
## Title_Master 6.801235e+00 Title_Master
## Pclass_2 4.736490e+00 Pclass_2
## Cabin_Charcter_E 4.200610e+00 Cabin_Charcter_E
## Embarked_C 3.047645e+00 Embarked_C
## TicketChar_U 2.504929e+00 TicketChar_U
## Cabin_Charcter_D 2.125412e+00 Cabin_Charcter_D
## Embarked_Q 1.425663e+00 Embarked_Q
## Cabin_Charcter_B 1.187897e+00 Cabin_Charcter_B
## Cabin_Charcter_C 1.163246e+00 Cabin_Charcter_C
## TicketChar_PC 1.010283e+00 TicketChar_PC
## TicketChar_STONO2 9.326869e-01 TicketChar_STONO2
## TicketChar_SWPP 8.352090e-01 TicketChar_SWPP
## Cabin_Charcter_A 7.681168e-01 Cabin_Charcter_A
## TicketChar_C 6.823772e-01 TicketChar_C
## TicketChar_CA 5.568782e-01 TicketChar_CA
## TicketChar_WC 4.286416e-01 TicketChar_WC
## TicketChar_SOPP 3.919196e-01 TicketChar_SOPP
## TicketChar_SCPARIS 3.519968e-01 TicketChar_SCPARIS
## TicketChar_SOTONOQ 3.346646e-01 TicketChar_SOTONOQ
## Title_Army 3.336782e-01 Title_Army
## Title_None 2.750759e-01 Title_None
## Cabin_Charcter_F 1.881099e-01 Cabin_Charcter_F
## TicketChar_WEP 1.489556e-01 TicketChar_WEP
## Cabin_Charcter_G 1.483677e-01 Cabin_Charcter_G
## TicketChar_PP 8.317271e-02 TicketChar_PP
## TicketChar_SOC 7.932757e-02 TicketChar_SOC
## TicketChar_FC 7.823381e-02 TicketChar_FC
## TicketChar_FCC 6.072494e-02 TicketChar_FCC
## Title_Rev. 4.615355e-02 Title_Rev.
## TicketChar_PPP 3.889417e-02 TicketChar_PPP
## TicketChar_SCAH 3.376204e-02 TicketChar_SCAH
## Title_Dr. 2.940985e-02 Title_Dr.
## TicketChar_SCAHBasle 1.879568e-02 TicketChar_SCAHBasle
## TicketChar_SC 1.780003e-02 TicketChar_SC
## TicketChar_SOP 1.381581e-02 TicketChar_SOP
## TicketChar_SCParis 1.033442e-02 TicketChar_SCParis
## TicketChar_A4 9.745014e-03 TicketChar_A4
## TicketChar_SP 6.312914e-03 TicketChar_SP
## TicketChar_LINE 2.791275e-03 TicketChar_LINE
## TicketChar_Fa 2.755659e-03 TicketChar_Fa
## Embarked_U 1.642452e-03 Embarked_U
## TicketChar_AS 1.190815e-03 TicketChar_AS
## TicketChar_SCA4 1.098984e-03 TicketChar_SCA4
## TicketChar_CASOTON 1.012628e-03 TicketChar_CASOTON
## TicketChar_SOTONO2 7.496708e-04 TicketChar_SOTONO2
## TicketChar_SCOW 0.000000e+00 TicketChar_SCOW
## TicketChar_SCA3 0.000000e+00 TicketChar_SCA3
## TicketChar_STONOQ 0.000000e+00 TicketChar_STONOQ
## TicketChar_AQ4 0.000000e+00 TicketChar_AQ4
## TicketChar_A2 0.000000e+00 TicketChar_A2
## TicketChar_LP 0.000000e+00 TicketChar_LP
## TicketChar_AQ3 0.000000e+00 TicketChar_AQ3
#predict
res<-predict(rf , valid)
confusionMatrix(data = res,
reference = valid$Survived)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 152 38
## 1 12 66
##
## Accuracy : 0.8134
## 95% CI : (0.7615, 0.8582)
## No Information Rate : 0.6119
## P-Value [Acc > NIR] : 8.745e-13
##
## Kappa : 0.5884
## Mcnemar's Test P-Value : 0.000407
##
## Sensitivity : 0.9268
## Specificity : 0.6346
## Pos Pred Value : 0.8000
## Neg Pred Value : 0.8462
## Prevalence : 0.6119
## Detection Rate : 0.5672
## Detection Prevalence : 0.7090
## Balanced Accuracy : 0.7807
##
## 'Positive' Class : 0
##
control <- trainControl(method="repeatedcv", number=10, repeats=3)
seed <- 7
metric <- "Accuracy"
set.seed(seed)
x<-xtrain
mtry <- sqrt(ncol(x))
tunegrid <- expand.grid(.mtry=mtry)
rf_default <- train(as.factor(Survived)~., data=x, method="rf", metric=metric, tuneGrid=tunegrid, trControl=control)
print(rf_default)
## Random Forest
##
## 623 samples
## 60 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 560, 561, 560, 561, 562, 561, ...
## Resampling results:
##
## Accuracy Kappa
## 0.8214231 0.6179442
##
## Tuning parameter 'mtry' was held constant at a value of 7.81025
The caret package in R provides an excellent facility to tune machine learning algorithm parameters.
As such, only mtry parameter is available in caret for tuning. The reason is its effect on the final accuracy and that it must be found empirically for a dataset.
The ntree parameter is different in that it can be as large as you like, and continues to increases the accuracy up to some point. It is less difficult or critical to tune and could be limited more by compute time available more than anything.
trainOnlyDummies<-dplyr::select(Rawtrain,Survived,Sex,AgeMean,Fare,GroupNo, TicketNumbers) testOnlyDummies<-dplyr::select(Rawtest,Sex,AgeMean,Fare,GroupNo, TicketNumbers)
trainOnlyDummies<-cbind(trainOnlyDummies,Train_Dummy) testOnlyDummies<-cbind(testOnlyDummies,Test_Dummy)
set.seed(1254) train.index<-sample(1:nrow(trainOnlyDummies),0.7*nrow(trainOnlyDummies)) xtrain<-trainOnlyDummies[train.index,] valid<-trainOnlyDummies[-train.index,]
control <- trainControl(method=“repeatedcv”, number=10, repeats=3, search=“random”) set.seed(seed) mtry <- sqrt(ncol(x)) rf_random <- train(as.factor(Survived)~., data=x, method=“rf”, metric=metric, tuneLength=15, trControl=control) print(rf_random) plot(rf_random)
control <- trainControl(method=“repeatedcv”, number=10, repeats=3, search=“grid”) set.seed(seed) tunegrid <- expand.grid(.mtry=c(1:25)) rf_gridsearch <- train(as.factor(Survived)~., data=x, method=“rf”, metric=metric, tuneGrid=tunegrid, trControl=control) print(rf_gridsearch) plot(rf_gridsearch)
x <- xtrain[,c(1:ncol(xtrain))] y <- xtrain[,1]
customRF <- list(type = “Classification”, library = “randomForest”, loop = NULL) customRF\(parameters <- data.frame(parameter = c("mtry", "ntree"), class = rep("numeric", 2), label = c("mtry", "ntree")) customRF\)grid <- function(x, y, len = NULL, search = “grid”) {} customRF\(fit <- function(x, y, wts, param, lev, last, weights, classProbs, ...) { randomForest(x, y, mtry = param\)mtry, ntree=param\(ntree, ...) } customRF\)predict <- function(modelFit, newdata, preProc = NULL, submodels = NULL) predict(modelFit, newdata) customRF\(prob <- function(modelFit, newdata, preProc = NULL, submodels = NULL) predict(modelFit, newdata, type = "prob") customRF\)sort <- function(x) x[order(x[,1]),] customRF\(levels <- function(x) x\)classes
control <- trainControl(method=“repeatedcv”, number=10, repeats=3) tunegrid <- expand.grid(.mtry=c(1:25), .ntree=c(1000, 1500, 2000, 2500)) set.seed(seed) custom <- train(as.factor(Survived)~., data=xtrain, method=customRF, metric=metric, tuneGrid=tunegrid, trControl=control) summary(custom) plot(custom)