This paper explains a simple way to clean the Titanic dataset.
From the 2 datasets provided by Kaggle, we have created 4 datasets. The first two datasets will be used to compare the performance of the different models. The last two datasets will be used to submit our predictions to the Kaggle website.
The advantage of cleaning the datasets provided by Kaggle in a simple way is the limited computational and research time needed to get our datasets ready for running our models. The drawback is, of course, that the models will achieve limited results.
In the next paper, we will attempt to improve the performance of the different models by improving the cleaning of the datasets.
The dataset is available on the Kaggle website at this link: (https://www.kaggle.com/c/titanic)
library(readr) # provides read_csv()
# Import the two Kaggle files from the current working directory.
# train carries the Survived label; test does not (see the NA counts below).
train <- read_csv(file = "train.csv")
test <- read_csv(file = "test.csv")
Because we need to pre-validate the models that we will use to submit predictions to the Kaggle website, we split the “train” dataset into a sub-training dataset and a validation dataset.
library(caTools) # provides sample.split()
set.seed(123) # fixed seed so the random split is reproducible

# Logical mask with one entry per row of train; TRUE marks the rows that go
# to the sub-training set (SplitRatio = 0.8 asks for 80% of them).
# NOTE: this variable masks base::split() for the rest of the script.
split <- sample.split(train$Survived, SplitRatio = 0.8)

training_set <- train[split, ] # rows flagged TRUE
validation_set <- train[!split, ] # the remaining rows

# The complete labelled data, kept unsplit alongside the two subsets.
final_training_set <- train
# Number of missing values in each column of the three raw sets.
vapply(training_set, function(column) sum(is.na(column)), numeric(1))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 141
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 546 2
vapply(validation_set, function(column) sum(is.na(column)), numeric(1))
## PassengerId Survived Pclass Name Sex Age
## 0 0 0 0 0 36
## SibSp Parch Ticket Fare Cabin Embarked
## 0 0 0 0 141 0
vapply(test, function(column) sum(is.na(column)), numeric(1))
## PassengerId Pclass Name Sex Age SibSp
## 0 0 0 0 86 0
## Parch Ticket Fare Cabin Embarked
## 0 0 1 327 0
# Keep every column except the ones listed in each setdiff() call.
# PassengerId is dropped from the three labelled sets but kept in test_clean
# (presumably because the Kaggle submission needs it; confirm).
training_set_clean <- training_set[
  setdiff(names(training_set), c("PassengerId", "Ticket", "Cabin", "Name"))
]
validation_set_clean <- validation_set[
  setdiff(names(validation_set), c("PassengerId", "Ticket", "Cabin", "Name"))
]
test_clean <- test[
  setdiff(names(test), c("Ticket", "Cabin", "Name"))
]
final_training_set_clean <- final_training_set[
  setdiff(names(final_training_set), c("PassengerId", "Ticket", "Cabin", "Name"))
]
# Fill every missing Age with the median of the observed ages in the same set.
# NOTE(review): each set is imputed with its own median, so validation and
# test values are not derived from the training data; confirm this is intended.
training_set_clean$Age[is.na(training_set_clean$Age)] <-
  median(training_set_clean$Age, na.rm = TRUE)
validation_set_clean$Age[is.na(validation_set_clean$Age)] <-
  median(validation_set_clean$Age, na.rm = TRUE)
test_clean$Age[is.na(test_clean$Age)] <-
  median(test_clean$Age, na.rm = TRUE)
final_training_set_clean$Age[is.na(final_training_set_clean$Age)] <-
  median(final_training_set_clean$Age, na.rm = TRUE)
The weakness of replacing NAs with the median, without using the other variables to predict the missing ages, is that about 20% of the data ends up exactly at the median, which is 27.
# Index plot of the test ages after imputation; the filled-in values all sit
# at the median.
plot(test_clean$Age)

# Fill missing Fare values in the test set with the median observed fare.
# test is the only set whose NA counts show a missing Fare (a single one).
# The statement used to appear twice; the second copy did nothing because no
# NA was left after the first, so it has been removed.
test_clean$Fare[is.na(test_clean$Fare)] <-
  median(test_clean$Fare, na.rm = TRUE)
# Count the passengers per port of embarkation in each set.
table(training_set_clean[["Embarked"]])
##
## C Q S
## 127 55 529
table(validation_set_clean[["Embarked"]])
##
## C Q S
## 41 22 115
table(test_clean[["Embarked"]])
##
## C Q S
## 102 46 270

# "S" is the most frequent port in every table above, so missing Embarked
# values are set to "S". Only the two sets built from the full training data
# are handled here; the NA counts show none in validation or test.
training_set_clean$Embarked <- replace(
  training_set_clean$Embarked,
  is.na(training_set_clean$Embarked),
  "S"
)
final_training_set_clean$Embarked <- replace(
  final_training_set_clean$Embarked,
  is.na(final_training_set_clean$Embarked),
  "S"
)
We now quickly recheck that no NAs remain, and we are all good.
# Recount the missing values per column in the four cleaned sets; every
# count should now be zero.
vapply(training_set_clean, function(column) sum(is.na(column)), numeric(1))
## Survived Pclass Sex Age SibSp Parch Fare Embarked
## 0 0 0 0 0 0 0 0
vapply(validation_set_clean, function(column) sum(is.na(column)), numeric(1))
## Survived Pclass Sex Age SibSp Parch Fare Embarked
## 0 0 0 0 0 0 0 0
vapply(test_clean, function(column) sum(is.na(column)), numeric(1))
## PassengerId Pclass Sex Age SibSp Parch
## 0 0 0 0 0 0
## Fare Embarked
## 0 0
vapply(final_training_set_clean, function(column) sum(is.na(column)), numeric(1))
## Survived Pclass Sex Age SibSp Parch Fare Embarked
## 0 0 0 0 0 0 0 0
# Standardise the two continuous features, Age and Fare, in every cleaned set
# with scale().
#
# The columns are selected by name rather than by position. The positions 4
# and 7 used previously only matched in test_clean because PassengerId sits
# where Survived sits in the other sets, and the Age/Fare labels on the
# test_clean lines were swapped.
#
# NOTE(review): each set is scaled with its own mean and standard deviation,
# so the validation and test features are not on exactly the same scale as
# the training features; confirm this is intended.
training_set_clean["Age"] <- scale(training_set_clean["Age"])
training_set_clean["Fare"] <- scale(training_set_clean["Fare"])

validation_set_clean["Age"] <- scale(validation_set_clean["Age"])
validation_set_clean["Fare"] <- scale(validation_set_clean["Fare"])

test_clean["Age"] <- scale(test_clean["Age"])
test_clean["Fare"] <- scale(test_clean["Fare"])

final_training_set_clean["Age"] <- scale(final_training_set_clean["Age"])
final_training_set_clean["Fare"] <- scale(final_training_set_clean["Fare"])
Now we are good to go, and we save our datasets to CSV files for the prediction part.
# Write each cleaned set to its own CSV file in the working directory,
# without the row-name column.
invisible(Map(
  function(dataset, path) {
    write.csv(dataset, file = path, row.names = FALSE)
  },
  list(
    training_set_clean,
    validation_set_clean,
    test_clean,
    final_training_set_clean
  ),
  c(
    "training_set_clean.csv",
    "validation_set_clean.csv",
    "test_clean.csv",
    "final_training_set_clean.csv"
  )
))