I learnt this from various sites, starting with the DataCamp R course, the Kaggle website and some blogs I read on how this problem can be tackled with anything from simple classification to random forests.
My first Kaggle competition on machine learning, using the Titanic disaster data. I got the train.csv and test.csv files from the Kaggle website.
#train_url <- "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/train.csv"
#train <- read.csv(train_url)
# Import the testing set: test
#test_url <- "http://s3.amazonaws.com/assets.datacamp.com/course/Kaggle/test.csv"
#test <- read.csv(test_url)
# Load the Kaggle Titanic training set. stringsAsFactors = TRUE is spelled out
# because the factor-based output shown below (and later steps such as
# summary(train_data$Sex)) rely on text columns being factors, which stopped
# being the read.csv() default in R 4.0.0.
train_data <- read.csv(
  "~/Google Drive/Kaggle/Titanic-Disaster/train.csv",
  stringsAsFactors = TRUE
)
# Print the structure: column names, types and the first few values
str(train_data)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
Passengers survived vs passengers passed away ; with respect to Female and Male
# Counts of each value in the Survived column
survival_counts <- table(train_data[["Survived"]])
survival_counts
##
## 0 1
## 549 342
# The same counts as shares of all passengers
prop.table(survival_counts)
##
## 0 1
## 0.6161616 0.3838384
# Cross-tabulation: Sex in the rows, Survived in the columns
sex_by_survival <- table(train_data[["Sex"]], train_data[["Survived"]])
sex_by_survival
##
## 0 1
## female 81 233
## male 468 109
# margin = 1 normalises each row, giving the survival share within each sex
prop.table(sex_by_survival, margin = 1)
##
## 0 1
## female 0.2579618 0.7420382
## male 0.8110919 0.1889081
# About 74% of the females survived
# but only about 19% of the males did
Create the column child, and indicate whether child or no child
# Child is 1 for passengers with a known age below 18 and 0 for everyone
# else, including passengers whose age is missing
train_data$Child <- ifelse(!is.na(train_data$Age) & train_data$Age < 18, 1, 0)
# Survival share within each Child group (rows sum to 1)
prop.table(
  table(train_data[["Child"]], train_data[["Survived"]]),
  margin = 1
)
##
## 0 1
## 0 0.6388175 0.3611825
## 1 0.4601770 0.5398230
# 54% of the children (Child = 1, age under 18) survived
# only 36% of the others (Child = 0: adults and passengers with unknown age) survived
So we could do 2 predictions for the test data; for that copy the test_data to test_one and test_two
# Load the Kaggle Titanic test set. stringsAsFactors = TRUE keeps the text
# columns as factors regardless of R version; since R 4.0.0 read.csv() would
# otherwise return them as character vectors.
test_data <- read.csv(
  "~/Google Drive/Kaggle/Titanic-Disaster/test.csv",
  stringsAsFactors = TRUE
)
PREDICTION 1
# Gender-only baseline: females are predicted to survive, everyone else not
test_one <- test_data
# %in% never returns NA, so every row that is not "female" ends up as 0
test_one$Survived <- as.numeric(test_one$Sex %in% "female")
# Submission table with one row per passenger
predict1 <- data.frame(
  PassengerId = test_one$PassengerId,
  Survived = test_one$Survived
)
rownames(predict1) <- NULL
write.csv(
  predict1,
  file = "~/Google Drive/Kaggle/Titanic-Disaster/predict1.csv",
  row.names = FALSE
)
PREDICTION 2
# Second baseline: work on a copy of the test set, starting with Survived = 0
test_two <- test_data
test_two$Survived <- 0
# Child is 1 for a known age below 18 and 0 otherwise (missing ages stay 0)
test_two$Child <- as.numeric(!is.na(test_two$Age) & test_two$Age < 18)
# Age distribution in the training data
summary(train_data[["Age"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.42 20.12 28.00 29.70 38.00 80.00 177
# Number of passengers of each sex in the training data
summary(train_data[["Sex"]])
## female male
## 314 577
# Sex by Survived as shares of all training passengers
sex_by_survival <- table(train_data[["Sex"]], train_data[["Survived"]])
prop.table(sex_by_survival)
##
## 0 1
## female 0.09090909 0.26150393
## male 0.52525253 0.12233446
# The same table normalised per row (share within each sex)
prop.table(sex_by_survival, margin = 1)
##
## 0 1
## female 0.2579618 0.7420382
## male 0.8110919 0.1889081
# Set Survived to 1 for females that are not flagged as a child (Child == 0)
test_two$Survived[test_two$Sex %in% "female" & test_two$Child == 0] <- 1
# Submission table with one row per passenger
predict2 <- data.frame(
  PassengerId = test_two$PassengerId,
  Survived = test_two$Survived
)
rownames(predict2) <- NULL
write.csv(
  predict2,
  file = "~/Google Drive/Kaggle/Titanic-Disaster/predict2.csv",
  row.names = FALSE
)
#Score 0.78469
#level : 1796
PREDICTION 3
#Another one more prediction taking more variables into account
# Share of ones in a 0/1 vector
survival_rate <- function(x) {
  sum(x) / length(x)
}
# Number of survivors in each Child / Sex group
aggregate(Survived ~ Child + Sex, data = train_data, FUN = sum)
## Child Sex Survived
## 1 0 female 195
## 2 1 female 38
## 3 0 male 86
## 4 1 male 23
# Number of passengers in each group (length of the Survived vector)
aggregate(Survived ~ Child + Sex, data = train_data, FUN = length)
## Child Sex Survived
## 1 0 female 259
## 2 1 female 55
## 3 0 male 519
## 4 1 male 58
# Survivors divided by group size: the survival rate per group
aggregate(Survived ~ Child + Sex, data = train_data, FUN = survival_rate)
## Child Sex Survived
## 1 0 female 0.7528958
## 2 1 female 0.6909091
## 3 0 male 0.1657033
## 4 1 male 0.3965517
#While the class variable is limited to a manageable 3 values, the fare is
#again a continuous variable that needs to be reduced to something that can
#be easily tabulated. Let’s bin the fares into less than $10, between
#$10 and $20, $20 to $30 and more than $30 and store it to a new variable:
# Bin Fare into four labelled bands on a copy of the training data
train_data3 <- train_data
fare_bands <- c("<10", "10-20", "20-30", "30+")
# findInterval() returns 0 for fares below 10, 1 for [10, 20), 2 for [20, 30)
# and 3 for fares of 30 or more
band_index <- findInterval(train_data3$Fare, c(10, 20, 30))
# A missing fare falls in none of the bands and keeps the "30+" label
band_index[is.na(band_index)] <- 3
train_data3$Fare2 <- fare_bands[band_index + 1]
# Survival rate for every fare band / class / sex combination
aggregate(
  Survived ~ Fare2 + Pclass + Sex,
  data = train_data3,
  FUN = function(x) sum(x) / length(x)
)
## Fare2 Pclass Sex Survived
## 1 20-30 1 female 0.8333333
## 2 30+ 1 female 0.9772727
## 3 10-20 2 female 0.9142857
## 4 20-30 2 female 0.9000000
## 5 30+ 2 female 1.0000000
## 6 <10 3 female 0.5937500
## 7 10-20 3 female 0.5813953
## 8 20-30 3 female 0.3333333
## 9 30+ 3 female 0.1250000
## 10 <10 1 male 0.0000000
## 11 20-30 1 male 0.4000000
## 12 30+ 1 male 0.3837209
## 13 <10 2 male 0.0000000
## 14 10-20 2 male 0.1587302
## 15 20-30 2 male 0.1600000
## 16 30+ 2 male 0.2142857
## 17 <10 3 male 0.1115385
## 18 10-20 3 male 0.2368421
## 19 20-30 3 male 0.1250000
## 20 30+ 3 male 0.2400000
#Men do worse than women irrespective of the class and the
#amount they paid. Women who were in class 3 and had expensive tickets
#(fare of 20 or more) mostly failed to get into a boat. So let's use this
#result for the new prediction in our test data
# Third baseline: females survive, except third-class females whose fare was
# 20 or more
test_three <- test_data
is_female <- test_data$Sex %in% "female"
# "%in% TRUE" turns an NA comparison (e.g. a missing fare) into FALSE, so such
# rows are never counted as expensive third-class tickets
pricey_third_class <- (test_data$Pclass == 3 & test_data$Fare >= 20) %in% TRUE
test_three$Survived <- as.numeric(is_female & !pricey_third_class)
# Submission table with one row per passenger
predict3 <- data.frame(
  PassengerId = test_three$PassengerId,
  Survived = test_three$Survived
)
rownames(predict3) <- NULL
write.csv(
  predict3,
  file = "~/Google Drive/Kaggle/Titanic-Disaster/predict3.csv",
  row.names = FALSE
)
PREDICTION 4
# Install rpart only when it is missing. An unconditional install.packages()
# re-downloads the package on every run and needs a configured CRAN mirror,
# which a non-interactive knit may not have. When rpart is already present
# this step prints nothing, so no download message is shown here.
if (!requireNamespace("rpart", quietly = TRUE)) {
  install.packages("rpart")
}
library(rpart)
# Classification tree (method = "class") predicting Survived from Sex and Age
my_tree <- rpart(Survived ~ Sex + Age, data = train_data, method = "class")
#plot(my_tree)
#text(my_tree)
#library(rattle) I had lot of trouble installing this... I get this working in the console but when i do knit it gives me problem..So commented it out..
library(rpart.plot)
library(RColorBrewer)
# Draw the tree with prp() and keep the "obj" element of the list it returns
new.fit <- prp(my_tree, snip = TRUE)[["obj"]]
## Warning: ignoring snip=TRUE for quartz_off_screen device
# Fancy plot (needs rattle, therefore disabled)
#fancyRpartPlot(my_tree)
# Predicted class for every row of the test set
my_prediction <- predict(my_tree, test_data, type = "class")
# Two-column submission table: PassengerId plus the predicted Survived value
my_solution <- data.frame(
  PassengerId = test_data$PassengerId,
  Survived = my_prediction
)
# The table should have 418 rows
nrow(my_solution)
## [1] 418
# Save the submission as decision_tree1.csv
write.csv(
  my_solution,
  file = "~/Google Drive/Kaggle/Titanic-Disaster/decision_tree1.csv",
  row.names = FALSE
)
PREDICTION 5
##Another decision tree prediction
library(rpart)
library(RColorBrewer)
# Classification tree using class, sex, age, family counts, fare and port
my_tree2 <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
  data = train_data,
  method = "class"
)
#plot(my_tree2)
#text(my_tree2)
# Draw the tree with prp() and keep the "obj" element of the list it returns
new.fit <- prp(my_tree2, snip = TRUE)[["obj"]]
## Warning: ignoring snip=TRUE for quartz_off_screen device
#library(rattle)
#library(rpart.plot)
# Fancy plot (needs rattle, therefore disabled)
#fancyRpartPlot(my_tree2)
# Predicted class for every row of the test set
my_prediction2 <- predict(my_tree2, test_data, type = "class")
# Two-column submission table: PassengerId plus the predicted Survived value
my_solution2 <- data.frame(
  PassengerId = test_data$PassengerId,
  Survived = my_prediction2
)
# The table should have 418 rows
nrow(my_solution2)
## [1] 418
# Save the submission as decision_tree2.csv
write.csv(
  my_solution2,
  file = "~/Google Drive/Kaggle/Titanic-Disaster/decision_tree2.csv",
  row.names = FALSE
)
PREDICTION 6
# Same predictors as before, but with the stopping rules relaxed through
# rpart.control(): minsplit = 2 and cp = 0
loose_control <- rpart.control(minsplit = 2, cp = 0)
fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked,
  data = train_data,
  method = "class",
  control = loose_control
)
# Draw the tree with prp() and keep the "obj" element of the list it returns
new.fit <- prp(fit, snip = TRUE)[["obj"]]
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
## Warning: ignoring snip=TRUE for quartz_off_screen device
#fancyRpartPlot(new.fit)
Some cleaning of data before we get on with this prediction
#Feature Engineering (Playing around with variables to squeeze some insights)
#Wrangling of data
# Look at the first passenger name
train_data[["Name"]][1]
## [1] Braund, Mr. Owen Harris
## 891 Levels: Abbing, Mr. Anthony ... Zimmerman, Mr. Leo
# To count how often each title occurs, the train and test data are stacked.
# rbind() needs the same columns in both, so the test set first gets the
# Child flag and an empty Survived column.
test_data$Child <- as.numeric(!is.na(test_data$Age) & test_data$Age < 18)
test_data$Survived <- NA # test_data did not have the Survived column
combi <- rbind(train_data, test_data)
# The names are stored as factors; string functions need character vectors
combi$Name <- as.character(combi$Name)
# Now lets split the string so that we can take out the titles seperately
# Splitting on a comma or a full stop cuts the name into pieces; strsplit()
# returns a list with one character vector per input string
strsplit(combi$Name[1], split = "[,.]")
## [[1]]
## [1] "Braund" " Mr" " Owen Harris"
# [[1]] takes that vector out of the list and [2] picks its second piece
strsplit(combi$Name[1], split = "[,.]")[[1]][2]
## [1] " Mr"
# Apply the same extraction to every name; vapply() also checks that each
# result is a single string
combi$Title <- vapply(
  combi$Name,
  function(full_name) strsplit(full_name, split = "[,.]")[[1]][2],
  character(1)
)
# Remove the first space of each title (the one left in front by the split)
combi$Title <- sub(" ", "", combi$Title, fixed = TRUE)
#some of the titles have similar meaning but they are written differently. So
#lets combine them using the "in" command
# Frequency of every extracted title
table(combi$Title)
##
## Capt Col Don Dona Dr
## 1 4 1 1 8
## Jonkheer Lady Major Master Miss
## 1 1 2 61 260
## Mlle Mme Mr Mrs Ms
## 2 1 757 197 2
## Rev Sir the Countess
## 8 1 1
# Some titles have a similar meaning but are written differently, so they are
# merged with %in%
combi$Title[combi$Title %in% c("Mme", "Mlle")] <- "Mlle"
combi$Title[combi$Title %in% c("Capt", "Don", "Major", "Sir")] <- "Sir"
# Only the first space of each title was stripped, so the countess is stored
# as "the Countess" with its inner space (see the table above). Matching on
# "theCountess" would find nothing and leave her outside the Lady group.
combi$Title[
  combi$Title %in% c("Dona", "Lady", "the Countess", "Jonkheer")
] <- "Lady"
# Convert back to a factor so the tree treats Title as a categorical variable
combi$Title <- factor(combi$Title)
#Now lets look at other variables related to a family size. It will be the addition
#of number of siblings + + his parents+ 1(for the person survived)
# Family size: siblings (SibSp) plus parents (Parch) plus one for the
# passenger
combi$FamilySize <- combi$SibSp + combi$Parch + 1
table(combi$FamilySize)
##
## 1 2 3 4 5 6 7 8 11
## 790 235 159 43 22 25 16 8 11
# To check the family sizes, also pull out the surname: the first piece of
# the name when it is split on a comma or a full stop
combi$Surname <- vapply(
  combi$Name,
  function(full_name) strsplit(full_name, split = "[,.]")[[1]][1],
  character(1)
)
# FamilyID is the family size glued in front of the surname; paste0() turns
# the number into text on its own
combi$FamilyID <- paste0(combi$FamilySize, combi$Surname)
# Passengers with a family size of 2 or less are grouped as "Small"
combi$FamilyID[combi$FamilySize <= 2] <- "Small"
# This is not quite enough: the table still lists IDs that occur only once
# or twice
table(combi$FamilyID)
##
## 11Sage 3Abbott 3Appleton 3Beckwith
## 11 3 1 2
## 3Boulos 3Bourke 3Brown 3Caldwell
## 3 3 4 3
## 3Christy 3Collyer 3Compton 3Cornell
## 2 3 3 1
## 3Coutts 3Crosby 3Danbom 3Davies
## 3 3 3 5
## 3Dodge 3Douglas 3Drew 3Elias
## 3 1 3 3
## 3Frauenthal 3Frolicher 3Frolicher-Stehli 3Goldsmith
## 1 1 2 3
## 3Gustafsson 3Hamalainen 3Hansen 3Hart
## 2 2 1 3
## 3Hays 3Hickman 3Hiltunen 3Hirvonen
## 2 3 1 1
## 3Jefferys 3Johnson 3Kink 3Kink-Heilmann
## 2 3 2 2
## 3Klasen 3Lahtinen 3Mallet 3McCoy
## 3 2 3 3
## 3Minahan 3Moubarek 3Nakid 3Navratil
## 1 3 3 3
## 3Newell 3Newsom 3Nicholls 3Peacock
## 1 1 1 3
## 3Peter 3Quick 3Richards 3Rosblom
## 3 3 2 3
## 3Samaan 3Sandstrom 3Silven 3Spedden
## 3 3 1 3
## 3Strom 3Taussig 3Thayer 3Thomas
## 1 3 3 1
## 3Touma 3van Billiard 3Van Impe 3Vander Planke
## 3 3 3 2
## 3Wells 3Wick 3Widener 4Allison
## 3 3 3 4
## 4Backstrom 4Baclini 4Becker 4Carter
## 1 4 4 4
## 4Davidson 4Dean 4Herman 4Hocking
## 1 4 4 2
## 4Jacobsohn 4Johnston 4Laroche 4Renouf
## 1 4 4 1
## 4Vander Planke 4West 5Ford 5Hocking
## 1 4 5 1
## 5Kink-Heilmann 5Lefebre 5Palsson 5Ryerson
## 1 5 5 5
## 6Fortune 6Panula 6Rice 6Richards
## 6 6 6 1
## 6Skoog 7Andersson 7Asplund 8Goodwin
## 6 9 7 8
## Small
## 1025
#Writing it in data.frame
# FamilyID frequencies as a data frame (columns Var1 and Freq)
famIDs <- data.frame(table(combi$FamilyID))
# Keep only the IDs that occur at most twice
famIDs <- famIDs[which(famIDs$Freq <= 2), ]
# Relabel the passengers carrying one of those IDs as "Small"
combi$FamilyID[is.element(combi$FamilyID, famIDs$Var1)] <- "Small"
# Store FamilyID as a factor for the tree
combi$FamilyID <- factor(combi$FamilyID)
#Now we have the variables required for feature engineering and so we need to split back
#the set to train and test data
# Undo the stacking: rows 1-891 go back to train_data, rows 892-1309 to
# test_data
train_data <- combi[seq_len(891), ]
test_data <- combi[seq(892, 1309), ]
# Classification tree that also uses the engineered Title, FamilySize and
# FamilyID columns
fit <- rpart(
  Survived ~ Pclass + Sex + Age + SibSp + Parch + Fare + Embarked + Title +
    FamilySize + FamilyID,
  data = train_data,
  method = "class"
)
# Draw the tree with prp() and keep the "obj" element of the list it returns
new.fit <- prp(fit, snip = TRUE)[["obj"]]
## Warning: ignoring snip=TRUE for quartz_off_screen device
# Fancy plot (needs rattle, therefore disabled)
#fancyRpartPlot(fit)
# Predicted class for every row of the test set
my_prediction3 <- predict(fit, test_data, type = "class")
# Two-column submission table: PassengerId plus the predicted Survived value
my_solution3 <- data.frame(
  PassengerId = test_data$PassengerId,
  Survived = my_prediction3
)
# The table should have 418 rows
nrow(my_solution3)
## [1] 418
# Save the submission as decision_tree3.csv
write.csv(
  my_solution3,
  file = "~/Google Drive/Kaggle/Titanic-Disaster/decision_tree3.csv",
  row.names = FALSE
)
PREDICTION 7
#Random Forest
#Lets make several small trees
# Share of ones in a 0/1 vector
share_surviving <- function(x) {
  sum(x) / length(x)
}
# Tree on passenger class only
fit1 <- rpart(Survived ~ Pclass, data = train_data, method = "class")
#fancyRpartPlot(fit1)
# Survival rate per class and sex
aggregate(Survived ~ Pclass + Sex, data = train_data, FUN = share_surviving)
## Pclass Sex Survived
## 1 1 female 0.9680851
## 2 2 female 0.9210526
## 3 3 female 0.5000000
## 4 1 male 0.3688525
## 5 2 male 0.1574074
## 6 3 male 0.1354467
# Tree on sex only
fit2 <- rpart(Survived ~ Sex, data = train_data, method = "class")
#fancyRpartPlot(fit2)
# Survival rate per sex
aggregate(Survived ~ Sex, data = train_data, FUN = share_surviving)
## Sex Survived
## 1 female 0.7420382
## 2 male 0.1889081
# Tree on port of embarkation only
fit3 <- rpart(Survived ~ Embarked, data = train_data, method = "class")
#fancyRpartPlot(fit3)
#fancyRpartPlot(fit3)
#############################
#Step 1 : Cleaning the data
#############################
summary(combi$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.17 21.00 28.00 29.88 39.00 80.00 263
# Age has 263 missing values. They are filled in with a regression tree
# (method = "anova") fitted on the rows where Age is known.
age_missing <- is.na(combi$Age)
Agefit <- rpart(
  Age ~ Pclass + Sex + SibSp + Parch + Fare + Embarked + Title + FamilySize,
  data = combi[!age_missing, ],
  method = "anova"
)
# Predict an age for exactly the rows where it was missing
combi$Age[age_missing] <- predict(Agefit, combi[age_missing, ])
summary(combi$Age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.17 22.00 28.86 29.70 36.50 80.00
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.17 22.00 28.86 29.70 36.50 80.00
#Now Age looks fine and lets find whether there are any other variables having similar problem
summary(combi$Embarked)
## C Q S
## 2 270 123 914
# Two passengers have an empty Embarked value. Most passengers boarded at
# Southampton (S), so both get "S".
missing_embarked <- which(combi$Embarked == "")
missing_embarked
## [1] 62 830
# Index with the rows just found rather than with typed-in row numbers, so
# the fix still hits the right rows if the data or its order changes
combi$Embarked[missing_embarked] <- "S"
# Re-create the factor to drop the now unused empty level
combi$Embarked <- factor(combi$Embarked)
summary(combi$Embarked)
## C Q S
## 270 123 916
# Now look at the Fare variable
summary(combi$Fare)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.000 7.896 14.450 33.300 31.280 512.300 1
# One fare is missing; it is replaced with the median of the known fares
missing_fare <- which(is.na(combi$Fare))
missing_fare
## [1] 1044
combi$Fare[missing_fare] <- median(combi$Fare, na.rm = TRUE)
#Step 2 -
#Lot to do... Still understanding from blogs and will be using in some available data