Read in training and test data.
# Load the training and test tables. Column 1 of each CSV supplies the row
# names, and text columns are kept as character vectors rather than factors.
train <- read.csv(
  file = "train.csv",
  header = TRUE,
  row.names = 1,
  stringsAsFactors = FALSE
)
test <- read.csv(
  file = "test.csv",
  header = TRUE,
  row.names = 1,
  stringsAsFactors = FALSE
)
Set aside the dependent variable (“Survived”).
# Keep the outcome in its own vector, then drop that column from `train` so
# only the predictor columns remain.
survived <- train[["Survived"]]
train <- train[, !(colnames(train) %in% "Survived")]
Show that name and ticket have too many unique values.
# Number of distinct passenger names in the training data.
length(unique(train[["Name"]]))
## [1] 891
# Number of distinct ticket values in the training data.
length(unique(train[["Ticket"]]))
## [1] 681
Show that cabin has too many empty values.
# Number of training rows whose cabin field is the empty string.
sum(train$Cabin == "", na.rm = TRUE)
## [1] 687
# Cross-tabulate empty versus recorded cabins against passenger class.
table(
  ifelse(train$Cabin == "", "Cabin empty", "Cabin given"),
  train$Pclass
)
##
## 1 2 3
## Cabin empty 40 168 479
## Cabin given 176 16 12
Show that both train and test have a few zero values for fare, but no or almost no NA values originally. Then, convert to NA if zero.
# Count zero fares and missing fares in the training data.
sum(train$Fare == 0, na.rm = TRUE)
## [1] 15
sum(is.na(train$Fare))
## [1] 0
# The same two counts for the test data.
sum(test$Fare == 0, na.rm = TRUE)
## [1] 2
sum(is.na(test$Fare))
## [1] 1
# Recode zero fares as missing; which() skips fares that are already NA.
train$Fare[which(train$Fare == 0)] <- NA
test$Fare[which(test$Fare == 0)] <- NA
Show that both train and test have the same few extreme values and next-to-highest values for fare. Also show similar range and distribution.
# Six largest fares in each data set; missing fares are sorted to the end.
head(sort(train$Fare, decreasing = TRUE, na.last = TRUE))
## [1] 512.3292 512.3292 512.3292 263.0000 263.0000 263.0000
head(sort(test$Fare, decreasing = TRUE, na.last = TRUE))
## [1] 512.3292 263.0000 263.0000 262.3750 262.3750 262.3750
# Smallest and largest non-missing fare in each data set.
range(train$Fare, na.rm = TRUE)
## [1] 4.0125 512.3292
range(test$Fare, na.rm = TRUE)
## [1] 3.1708 512.3292
# Draw the two fare histograms side by side, with bar counts labelled.
par(mfrow = c(1, 2))
hist(
  train$Fare,
  main = "Train",
  xlab = "Fare",
  ylab = "Passengers",
  labels = TRUE
)
hist(
  test$Fare,
  main = "Test",
  xlab = "Fare",
  ylab = "Passengers",
  labels = TRUE
)
Show the distribution of fares within each class in the training data, after truncating outliers.
# Cap every fare above 500 at 263 in both data sets. which() keeps only the
# positions where the comparison is TRUE, so missing fares stay untouched.
train$Fare[which(train$Fare > 500)] <- 263
test$Fare[which(test$Fare > 500)] <- 263
# One histogram of training fares per passenger class on a 2 x 2 plot grid.
par(mfrow = c(2, 2))
for (class in 1:3) {
  hist(
    train$Fare[train$Pclass == class],
    main = paste0("Class=", class),
    xlab = "Fare",
    ylab = "Passengers",
    labels = TRUE
  )
}
Show that very few first and second class passengers came aboard at Queenstown.
# Port of embarkation by passenger class, training data.
table(train[["Embarked"]], train[["Pclass"]])
##
## 1 2 3
## 2 0 0
## C 85 17 66
## Q 2 3 72
## S 127 164 353
# Port of embarkation by passenger class, test data.
table(test[["Embarked"]], test[["Pclass"]])
##
## 1 2 3
## C 56 11 35
## Q 1 4 41
## S 50 78 142
Show that first-class passengers who embarked at Cherbourg tended to have higher fares than those at Southampton.
# Treat a blank port of embarkation in the training data as missing.
train$Embarked[which(train$Embarked == "")] <- NA
# Box plots of first-class fares by port, leaving out port "Q".
# After the recoding above, `Embarked != "Q"` is NA for passengers with an
# unknown port, and an NA in a logical row index yields an all-NA row.
# which() keeps only rows where the condition is definitely TRUE, so no such
# placeholder rows reach boxplot().
boxplot(
  Fare ~ Embarked,
  data = train[which(train$Embarked != "Q" & train$Pclass == 1), ],
  xlab = "",
  ylab = "Fare",
  main = "Train, 1st class passengers"
)
boxplot(
  Fare ~ Embarked,
  data = test[which(test$Embarked != "Q" & test$Pclass == 1), ],
  xlab = "",
  ylab = "Fare",
  main = "Test, 1st class passengers"
)
Show that, based on fare, the passengers who embarked at an unknown port are more likely to have come from Cherbourg.
# Redraw the first-class training box plot (port "Q" left out) and mark a fare
# of 80 with a dashed line, labelled in the legend as the unknown-port fare.
# which() drops rows where the condition is NA (missing port), so the subset
# holds no all-NA placeholder rows.
boxplot(
  Fare ~ Embarked,
  data = train[which(train$Embarked != "Q" & train$Pclass == 1), ],
  xlab = "",
  ylab = "Fare",
  main = "Train, 1st class passengers"
)
abline(h = 80, lty = 2)
legend("topright", legend = "Fare for unknown port", lty = 2)
For each sex and class, compare survival rates between ports.
# For every class/sex combination, cross-tabulate port of embarkation against
# survival, then print the counts and the survival percentage per port
# (second table column over the row total, rounded to a whole number).
for (class in 1:3) {
  for (sex in c("male", "female")) {
    print(paste0("Class=", class, ";Sex=", sex))
    # Rows belonging to the current class/sex group.
    in_group <- train$Pclass == class & train$Sex == sex
    survival_rates <- table(train$Embarked[in_group], survived[in_group])
    print(survival_rates)
    print(round(survival_rates[, 2] * 100 / rowSums(survival_rates)))
  }
}
## [1] "Class=1;Sex=male"
##
## 0 1
## C 25 17
## Q 1 0
## S 51 28
## C Q S
## 40 0 35
## [1] "Class=1;Sex=female"
##
## 0 1
## C 1 42
## Q 0 1
## S 2 46
## C Q S
## 98 100 96
## [1] "Class=2;Sex=male"
##
## 0 1
## C 8 2
## Q 1 0
## S 82 15
## C Q S
## 20 0 15
## [1] "Class=2;Sex=female"
##
## 0 1
## C 0 7
## Q 0 2
## S 6 61
## C Q S
## 100 100 91
## [1] "Class=3;Sex=male"
##
## 0 1
## C 33 10
## Q 36 3
## S 231 34
## C Q S
## 23 8 13
## [1] "Class=3;Sex=female"
##
## 0 1
## C 8 15
## Q 9 24
## S 55 33
## C Q S
## 65 73 38
For each sex and class, compare survival rates by number of relatives (SibSp + Parch).
# Relatives aboard = siblings/spouses (SibSp) plus parents/children (Parch).
num_relatives <- with(train, SibSp + Parch)
# Frequency of each relative count.
table(num_relatives)
## num_relatives
## 0 1 2 3 4 5 6 7 10
## 537 161 102 29 15 22 12 6 7
# Pool every passenger with more than three relatives into one group coded 4.
num_relatives[which(num_relatives > 3)] <- 4
# For every class/sex combination, cross-tabulate the pooled relative count
# against survival, then print the counts and the survival percentage per
# group (second table column over the row total, rounded to a whole number).
for (class in 1:3) {
  for (sex in c("male", "female")) {
    print(paste0("Class=", class, ";Sex=", sex))
    # Rows belonging to the current class/sex group.
    in_group <- train$Pclass == class & train$Sex == sex
    survival_rates <- table(num_relatives[in_group], survived[in_group])
    print(survival_rates)
    print(round(survival_rates[, 2] * 100 / rowSums(survival_rates)))
  }
}
## [1] "Class=1;Sex=male"
##
## 0 1
## 0 50 25
## 1 19 12
## 2 6 5
## 3 0 3
## 4 2 0
## 0 1 2 3 4
## 33 39 45 100 0
## [1] "Class=1;Sex=female"
##
## 0 1
## 0 1 33
## 1 0 39
## 2 0 13
## 3 2 2
## 4 0 4
## 0 1 2 3 4
## 97 100 100 50 100
## [1] "Class=2;Sex=male"
##
## 0 1
## 0 65 7
## 1 14 1
## 2 9 8
## 3 3 1
## 0 1 2 3
## 10 7 47 25
## [1] "Class=2;Sex=female"
##
## 0 1
## 0 3 29
## 1 2 17
## 2 1 13
## 3 0 9
## 4 0 2
## 0 1 2 3 4
## 91 89 93 100 100
## [1] "Class=3;Sex=male"
##
## 0 1
## 0 232 32
## 1 23 5
## 2 17 8
## 3 2 1
## 4 26 1
## 0 1 2 3 4
## 12 18 32 33 4
## [1] "Class=3;Sex=female"
##
## 0 1
## 0 23 37
## 1 14 15
## 2 10 12
## 3 1 5
## 4 24 3
## 0 1 2 3 4
## 62 52 55 83 11