I did this prediction for survival in the Kaggle Titanic dataset about 2 months ago. This model parsed out passengers gender and age information from provided “Title” information achieved a test accuracy of .79904 on Kaggle. This should be able to serve as a brief guideline on how to do simple Kaggle projects for beginners.
First, combine train and test sets for feature transformations, and reset names for easier reference:
combined<-merge(train, test, all.x =T, all.y = T)
names(combined)<-c("id","class","name","sex","age","sibsp","parch","ticket","fare","cabin","embarked","sur")
Next, check the structure of the data:
str(combined)
## 'data.frame': 1309 obs. of 12 variables:
## $ id : int 1 2 3 4 5 6 7 8 9 10 ...
## $ class : int 3 1 3 1 3 3 1 3 3 2 ...
## $ name : Factor w/ 1307 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ sibsp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ ticket : Factor w/ 929 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ cabin : Factor w/ 187 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ embarked: Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
## $ sur : int 0 1 1 1 0 0 0 0 1 1 ...
summary(combined)
## id class name
## Min. : 1 Min. :1.000 Connolly, Miss. Kate : 2
## 1st Qu.: 328 1st Qu.:2.000 Kelly, Mr. James : 2
## Median : 655 Median :3.000 Abbing, Mr. Anthony : 1
## Mean : 655 Mean :2.295 Abbott, Mr. Rossmore Edward : 1
## 3rd Qu.: 982 3rd Qu.:3.000 Abbott, Mrs. Stanton (Rosa Hunt): 1
## Max. :1309 Max. :3.000 Abelson, Mr. Samuel : 1
## (Other) :1301
## sex age sibsp parch
## female:466 Min. : 0.17 Min. :0.0000 Min. :0.000
## male :843 1st Qu.:21.00 1st Qu.:0.0000 1st Qu.:0.000
## Median :28.00 Median :0.0000 Median :0.000
## Mean :29.88 Mean :0.4989 Mean :0.385
## 3rd Qu.:39.00 3rd Qu.:1.0000 3rd Qu.:0.000
## Max. :80.00 Max. :8.0000 Max. :9.000
## NA's :263
## ticket fare cabin embarked
## CA. 2343: 11 Min. : 0.000 :1014 : 2
## 1601 : 8 1st Qu.: 7.896 C23 C25 C27 : 6 C:270
## CA 2144 : 8 Median : 14.454 B57 B59 B63 B66: 5 Q:123
## 3101295 : 7 Mean : 33.295 G6 : 5 S:914
## 347077 : 7 3rd Qu.: 31.275 B96 B98 : 4
## 347082 : 7 Max. :512.329 C22 C26 : 4
## (Other) :1261 NA's :1 (Other) : 271
## sur
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3838
## 3rd Qu.:1.0000
## Max. :1.0000
## NA's :418
The id, class sur should be treated as factors:
combined[,1]<-as.factor(combined[,1])
combined[,2]<-as.factor(combined[,2])
combined[,12]<-as.factor(combined[,12])
There are missing values in some features, and the mice package provides a nice function to detect missing pattern (for small datasets)
md.pattern(combined)
## id class name sex sibsp parch ticket cabin embarked fare age sur
## 714 1 1 1 1 1 1 1 1 1 1 1 1 0
## 177 1 1 1 1 1 1 1 1 1 1 0 1 1
## 331 1 1 1 1 1 1 1 1 1 1 1 0 1
## 86 1 1 1 1 1 1 1 1 1 1 0 0 2
## 1 1 1 1 1 1 1 1 1 1 0 1 0 2
## 0 0 0 0 0 0 0 0 0 1 263 418 682
# the NA's for cabin and embarked are marked as empty values, change them to NA
combined$cabin[which(combined$cabin == "")] <- NA
combined$embarked[which(combined$embarked == "")] <- NA
# check the specific cabin levels
head(levels(combined$cabin))
## [1] "" "A10" "A14" "A16" "A19" "A20"
It seems that cabins are from letters “A to G” + a special T cabin, so they can be regrouped and simplified into one letter-only factor levels
combined$cabin<-as.character(combined$cabin)
cabin.levels<-LETTERS[1:7]
for(i in cabin.levels){
combined$cabin[grep(i, combined$cabin)]<-i
}
combined$cabin<-as.factor(combined$cabin)
There is an empty level in “embarked”, drop the empty level:
combined$embarked<-droplevels(combined$embarked)
missingembarked<-which(is.na(combined$embarked)) #62, 830 were missing. since most people embarked at S, impute with S
combined$embarked[c(62,830)] <- "S"
Missing age can be guessed from person titles. First, we extract unique titles from name:
combined$title<-character(length = nrow(combined))
titles<-c("Mr.","Mrs.","Miss","Master","Special") # special is for Dr., Capt., Major., etc.
# replace all common titles
for(i in titles[1:4]){
combined$title[grep(i, combined$name)] <- i
}
# replace others with all special titles
combined$title[which(combined$title == "")] <- "Special"
combined$title<-factor(combined$title, c("Mr.","Mrs.","Miss", "Master","Special"))
# checkout the data now
table(combined$title)
##
## Mr. Mrs. Miss Master Special
## 758 199 260 61 31
# plot title against age colored by survival
qplot(title, age, color = sur, data = combined, geom = "jitter")
## Warning: Removed 263 rows containing missing values (geom_point).
To impute age, we find mean, median, and mode for each title group
mean.age<-aggregate(combined$age, by = list(combined$title), mean, na.rm = T)
median.age<-aggregate(combined$age, by = list(combined$title), median, na.rm = T)
# a function to calculate mode
Mode <- function(x, na.rm = FALSE) {
if(na.rm){
x = x[!is.na(x)]
}
ux <- unique(x)
return(ux[which.max(tabulate(match(x, ux)))])
}
mode.age<-aggregate(combined$age, by = list(combined$title), Mode, na.rm = T)
# merge the results into one df
dt1<-merge(mean.age, median.age, by = "Group.1", all.x = T)
info<-merge(dt1, mode.age, by = "Group.1", all.x = T)
names(info)<-c("title","mean","median","mode")
# age distribution according to titles
qplot(age, color = title, data = combined, geom = "density")
## Warning: Removed 263 rows containing non-finite values (stat_density).
Now we impute age with these criteria:
# imputation
naage.id<-which(is.na(combined$age))
naage.title<-combined$title[naage.id]
age.imputed<-numeric()
for(i in 1:length(naage.title)){
if(naage.title[i] == "Master"){age.imputed[i] = info[1,4]} else
if(naage.title[i] == "Miss"){age.imputed[i] = info[2,3]} else
if(naage.title[i] == "Mr."){age.imputed[i] = info[3,3]} else
if(naage.title[i] == "Mrs."){age.imputed[i] = info[4,3]} else
{age.imputed[i] = info[5,3]}
}
combined$age.imputed<-combined$age #create a new column for imputed age
combined$age.imputed[naage.id]<-age.imputed
# plot imputed age
qplot(age.imputed, color = title, data = combined, geom = "density")
# squareroot transform age to normality
combined$age.sqrt<-sqrt(combined$age.imputed)
Then, we impute values for fare:
# find the missing observations for fare
missingfare<-which(is.na(combined$fare)) # case 1044 is missing, find which class is this case
combined$class[1044] # it is a 3rd class ticket, find the average fare for 3rd clas
## [1] 3
## Levels: 1 2 3
mean.fare<-aggregate(combined$fare, by = list(combined$class), mean, na.rm = T)
qplot(fare, color = class, data = combined, geom = "density") # it is skewed with a high kurtosis, impute with mode
## Warning: Removed 1 rows containing non-finite values (stat_density).
mode.fare<-aggregate(combined$fare, by = list(combined$class), Mode, na.rm = T) #8.05 for 3rd class
mode.fare
## Group.1 x
## 1 1 26.55
## 2 2 13.00
## 3 3 8.05
combined$fare.imputed<-combined$fare
combined$fare.imputed[1044]<-mode.fare[3,2]
Create a new feature capturing family size: family size = # of parent and child + # of siblings + 1.
combined$fam_size<-combined$sibsp + combined$parch + 1
qplot(fam_size, fill = class, data = combined)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
It is time to make some quick graphs, I like the multiplot function from the Rmisc package, which is a collection of some good utility functions:
# Split data back into train and test sets:
train<-combined[1:891, ]
test<-combined[892:1309,]
# exploratory on categorical features:
p1<-qplot(class, fill = sur, data = train)
p2<-qplot(sex, fill = sur, data = train)
p3<-qplot(embarked, fill = sur, data = train)
p4<-qplot(cabin, fill = sur, data =train) #too many missing values for cabin, not include in the model
multiplot(p1,p2,p3,p4, cols = 2)
# exploratory on continuous features:
p5<-qplot(age.imputed, fill = sur, data = train)
p6<-qplot(fam_size, fill = sur, data = train, binwidth = range(train$sibsp, na.rm =T)[2]/10)
p7<-qplot(fare.imputed, fill = sur, data = train, binwidth = range(train$fare, na.rm = T)[2]/15)
multiplot(p5,p6,p7, cols = 2)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Now it is time to apply the logistic regression algorithm:
# train a logistic regression model
# title covers the information of sex and age already, and it predicts better than ageXsex interaction terms
model<-glm(sur~fam_size+class+title+fare.imputed, family = "binomial", data = train)
summary(model)
##
## Call:
## glm(formula = sur ~ fam_size + class + title + fare.imputed,
## family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3841 -0.6463 -0.4191 0.5538 2.5650
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.126951 0.257566 -0.493 0.62209
## fam_size -0.453264 0.081010 -5.595 2.20e-08 ***
## class2 -0.905097 0.298985 -3.027 0.00247 **
## class3 -1.844631 0.288583 -6.392 1.64e-10 ***
## titleMrs. 3.668322 0.311574 11.774 < 2e-16 ***
## titleMiss 3.216769 0.249955 12.869 < 2e-16 ***
## titleMaster 3.979301 0.488043 8.154 3.53e-16 ***
## titleSpecial 0.329377 0.461721 0.713 0.47562
## fare.imputed 0.005064 0.002696 1.878 0.06033 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1186.66 on 890 degrees of freedom
## Residual deviance: 747.83 on 882 degrees of freedom
## AIC: 765.83
##
## Number of Fisher Scoring iterations: 5
# apply model
prediction<-predict.glm(model, newdata = test, type = "response")
# set cutoff, >.55 survived, <.55 dead
cutoff<-.55
sur<-vector()
for(i in 1:nrow(test)){
if(prediction[i] < cutoff) {sur[i] = 0} else
{sur[i] = 1}
}
result<-test[, c(1,12)]
result[,2]<-sur
names(result)<-c("Passengerid","Survived")
# write the output
write.csv(result, file = "solution.csv", row.names = F)