# Read the training and test sets from CSV; the first row of each file holds
# the column names.
train <- read.csv(
  file = "C:\\Users\\Sweta\\Documents\\Data\\train.csv",
  header = TRUE
)
test <- read.csv(
  file = "C:\\Users\\Sweta\\Documents\\Data\\test.csv",
  header = TRUE
)
# The test set (418 rows) has no "Survived" column, whereas the training set
# (891 rows) does. Prepend a placeholder "Survived" column filled with "None"
# so that both data frames expose the same set of columns.
test.Survived <- data.frame(
  Survived = rep("None", times = nrow(test)),
  test[, ]
)
# Stack the training rows on top of the augmented test rows.
data.combined <- rbind(train, test.Survived)
# Print the structure (column names and types) of the combined data.
str(data.combined)
## 'data.frame': 1309 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : chr "0" "1" "1" "1" ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
# Pclass (int) and Survived (chr) are categorical, so store both as factors.
data.combined[c("Pclass", "Survived")] <- lapply(
  data.combined[c("Pclass", "Survived")],
  as.factor
)
# Tabulate how many rows fall into each "Survived" level.
table(data.combined[["Survived"]])
##
## 0 1 None
## 549 342 418
# Tabulate how many rows fall into each passenger class (Pclass).
table(data.combined[["Pclass"]])
##
## 1 2 3
## 323 277 709
# Load ggplot2 for data visualization (the package must already be installed)
library(ggplot2)
# Hypothesis: upper-class passengers had a higher survival rate than the rest.
# Plot the passenger count per class, with bars split by survival outcome.
train$Pclass <- as.factor(train$Pclass)
ggplot(train, aes(x = Pclass, fill = factor(Survived))) +
  geom_bar() +
  labs(x = "Pclass", y = "Total Count", fill = "Survived")
# Peek at the first few passenger names in the training set.
as.character(head(train$Name))
## [1] "Braund, Mr. Owen Harris"
## [2] "Cumings, Mrs. John Bradley (Florence Briggs Thayer)"
## [3] "Heikkinen, Miss. Laina"
## [4] "Futrelle, Mrs. Jacques Heath (Lily May Peel)"
## [5] "Allen, Mr. William Henry"
## [6] "Moran, Mr. James"
# Count the distinct passenger names in the combined data
# (each name is counted once, at its first occurrence).
sum(!duplicated(as.character(data.combined$Name)))
## [1] 1307
# Collect, as character strings, every name that repeats an earlier row's name.
dup.names <- as.character(data.combined$Name)
dup.names <- dup.names[duplicated(dup.names)]
# Show all rows of data.combined whose name is one of those repeated names.
data.combined[data.combined$Name %in% dup.names, ]
## PassengerId Survived Pclass Name Sex Age SibSp Parch
## 290 290 1 3 Connolly, Miss. Kate female 22.0 0 0
## 697 697 0 3 Kelly, Mr. James male 44.0 0 0
## 892 892 None 3 Kelly, Mr. James male 34.5 0 0
## 898 898 None 3 Connolly, Miss. Kate female 30.0 0 0
## Ticket Fare Cabin Embarked
## 290 370373 7.7500 Q
## 697 363592 8.0500 S
## 892 330911 7.8292 Q
## 898 330972 7.6292 Q
# Collect every row of data.combined whose name contains the title "Miss."
# and store them in `misses`.
library(stringr)
# fixed() makes the match literal: as a regex, the "." in "Miss." would match
# any character, so e.g. "Missy" would be picked up as well.
misses <- data.combined[which(str_detect(data.combined$Name, fixed("Miss."))), ]
# head() shows at most five rows and never pads with NA rows.
head(misses, 5)
## PassengerId Survived Pclass Name Sex Age
## 3 3 1 3 Heikkinen, Miss. Laina female 26
## 11 11 1 3 Sandstrom, Miss. Marguerite Rut female 4
## 12 12 1 1 Bonnell, Miss. Elizabeth female 58
## 15 15 0 3 Vestrom, Miss. Hulda Amanda Adolfina female 14
## 23 23 1 3 McGowan, Miss. Anna "Annie" female 15
## SibSp Parch Ticket Fare Cabin Embarked
## 3 0 0 STON/O2. 3101282 7.9250 S
## 11 1 1 PP 9549 16.7000 G6 S
## 12 0 0 113783 26.5500 C103 S
## 15 0 0 350406 7.8542 S
## 23 0 0 330923 8.0292 Q
# Collect every row of data.combined whose name contains the title "Mrs."
# and store them in `mrses`.
# fixed() makes the match literal: as a regex, the "." in "Mrs." would match
# any character rather than only a period.
mrses <- data.combined[which(str_detect(data.combined$Name, fixed("Mrs."))), ]
# head() shows at most five rows and never pads with NA rows.
head(mrses, 5)
## PassengerId Survived Pclass
## 2 2 1 1
## 4 4 1 1
## 9 9 1 3
## 10 10 1 2
## 16 16 1 2
## Name Sex Age SibSp Parch
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
## 9 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27 0 2
## 10 Nasser, Mrs. Nicholas (Adele Achem) female 14 1 0
## 16 Hewlett, Mrs. (Mary D Kingcome) female 55 0 0
## Ticket Fare Cabin Embarked
## 2 PC 17599 71.2833 C85 C
## 4 113803 53.1000 C123 S
## 9 347742 11.1333 S
## 10 237736 30.0708 C
## 16 248706 16.0000 S
# Store every row of data.combined whose Sex is "male" in `males`.
# The mask is built from data.combined itself: a mask built from `train` only
# covers the leading training rows and silently omits males in the test rows.
# which() drops any NA comparisons instead of producing NA rows.
males <- data.combined[which(data.combined$Sex == "male"), ]
# head() shows at most five rows and never pads with NA rows.
head(males, 5)
## PassengerId Survived Pclass Name Sex Age SibSp
## 1 1 0 3 Braund, Mr. Owen Harris male 22 1
## 5 5 0 3 Allen, Mr. William Henry male 35 0
## 6 6 0 3 Moran, Mr. James male NA 0
## 7 7 0 1 McCarthy, Mr. Timothy J male 54 0
## 8 8 0 3 Palsson, Master. Gosta Leonard male 2 3
## Parch Ticket Fare Cabin Embarked
## 1 0 A/5 21171 7.2500 S
## 5 0 373450 8.0500 S
## 6 0 330877 8.4583 Q
## 7 0 17463 51.8625 E46 S
## 8 1 349909 21.0750 S
# extractTitle: return the title contained in a passenger name.
#
# The literal substrings "Miss.", "Master.", "Mrs." and "Mr." are searched for
# in that order and the first one found is returned; "Other" is returned when
# none of them occurs. The result is used to build the "title" column of
# data.combined.
#
# Matching is literal (fixed = TRUE), so the trailing "." must be a real
# period. As a regular expression it would match any character, letting a
# surname such as "Mraz" count as "Mr." or "Missy" count as "Miss.".
#
# Argument:
#   Name  a single name (character or factor),
#         e.g. "Braund, Mr. Owen Harris".
# Value:
#   A length-one character vector: "Miss.", "Master.", "Mrs.", "Mr." or
#   "Other".
extractTitle <- function(Name) {
  # Work on plain text even when the caller passes a factor.
  name <- as.character(Name)
  known.titles <- c("Miss.", "Master.", "Mrs.", "Mr.")
  for (title in known.titles) {
    if (any(grepl(title, name, fixed = TRUE))) {
      return(title)
    }
  }
  "Other"
}
# Compute the title of every passenger in data.combined and store the result
# as the factor column "title".
# vapply() calls extractTitle() once per name and guarantees one string per
# row; unlike growing a vector with c() inside a 1:nrow() loop, it does not
# re-copy the result on every step and it copes with a zero-row data frame.
titles <- vapply(
  as.character(data.combined$Name),
  extractTitle,
  character(1),
  USE.NAMES = FALSE
)
data.combined$title <- as.factor(titles)
# Bar chart of passenger counts per title, split by survival outcome, with one
# panel per passenger class. Only the leading rows of data.combined that came
# from `train` are plotted, since only those carry a real "Survived" value; the
# row count is taken from `train` instead of being hard-coded.
ggplot(
  data.combined[seq_len(nrow(train)), ],
  aes(x = title, fill = Survived)
) +
  geom_bar() +
  facet_wrap(~Pclass) +
  ggtitle("Pclass") +
  xlab("Title") +
  ylab("Total Count") +
  labs(fill = "Survived")