Titanic-Dataset-Analysis-in-R.utf8

# Load the Titanic training and test data sets from their CSV files.

train <- read.csv(
  file = "C:\\Users\\Sweta\\Documents\\Data\\train.csv",
  header = TRUE
)
test <- read.csv(
  file = "C:\\Users\\Sweta\\Documents\\Data\\test.csv",
  header = TRUE
)

# Add the missing variable: the test data has 418 observations of 11 variables,
# while the train data has 891 observations of 12 variables. The "Survived"
# variable is absent from the test data, so add it as a placeholder column
# filled with "None".

test.Survived <- data.frame(
  Survived = rep("None", times = nrow(test)),
  test
)

# Combine the data sets: stack train and the padded test set row-wise.

data.combined <- rbind(train, test.Survived)

# Inspect the structure (column names and data types) of the combined data.

str(data.combined)

## 'data.frame':    1309 obs. of  12 variables:
##  $ PassengerId: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ Survived   : chr  "0" "1" "1" "1" ...
##  $ Pclass     : int  3 1 3 1 3 3 1 3 3 2 ...
##  $ Name       : chr  "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
##  $ Sex        : chr  "male" "female" "female" "female" ...
##  $ Age        : num  22 38 26 35 35 NA 54 2 27 14 ...
##  $ SibSp      : int  1 1 0 1 0 0 0 3 0 1 ...
##  $ Parch      : int  0 0 0 0 0 0 0 1 2 0 ...
##  $ Ticket     : chr  "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
##  $ Fare       : num  7.25 71.28 7.92 53.1 8.05 ...
##  $ Cabin      : chr  "" "C85" "" "C123" ...
##  $ Embarked   : chr  "S" "C" "S" "S" ...

# Pclass (int) and Survived (chr) are categorical, so convert both columns
# to factors.

data.combined[c("Pclass", "Survived")] <- lapply(
  data.combined[c("Pclass", "Survived")],
  as.factor
)

# Tabulate the data by the "Survived" variable.

table(data.combined[["Survived"]])

## 
##    0    1 None 
##  549  342  418

# Show how the passengers are distributed across the "Pclass" levels.

table(data.combined[["Pclass"]])

## 
##   1   2   3 
## 323 277 709

#For data visualization, the ggplot2 package must be installed; it is then loaded with library()

library(ggplot2)

# Hypothesis: upper-class passengers had a higher survival rate than the
# others. Plot the survival counts per passenger class for the train data.

train[["Pclass"]] <- as.factor(train[["Pclass"]])
ggplot(train, aes(x = Pclass, fill = factor(Survived))) +
  geom_bar() +
  labs(x = "Pclass", y = "Total Count", fill = "Survived")

# Look at the first few values of the Name variable in the train data set.

head(as.character(train[["Name"]]), n = 6L)

## [1] "Braund, Mr. Owen Harris"                            
## [2] "Cumings, Mrs. John Bradley (Florence Briggs Thayer)"
## [3] "Heikkinen, Miss. Laina"                             
## [4] "Futrelle, Mrs. Jacques Heath (Lily May Peel)"       
## [5] "Allen, Mr. William Henry"                           
## [6] "Moran, Mr. James"

# Count the distinct names in data.combined (every first occurrence of a
# name counts once).

sum(!duplicated(as.character(data.combined[["Name"]])))

## [1] 1307

# Treat Name as character, find the names that occur more than once in
# data.combined, and keep them in dup.names as a character vector.

dup.names <- as.character(
  data.combined$Name[duplicated(as.character(data.combined$Name))]
)

# Show every record of data.combined whose name is one of the duplicates.

data.combined[data.combined$Name %in% dup.names, ]

##     PassengerId Survived Pclass                 Name    Sex  Age SibSp Parch
## 290         290        1      3 Connolly, Miss. Kate female 22.0     0     0
## 697         697        0      3     Kelly, Mr. James   male 44.0     0     0
## 892         892     None      3     Kelly, Mr. James   male 34.5     0     0
## 898         898     None      3 Connolly, Miss. Kate female 30.0     0     0
##     Ticket   Fare Cabin Embarked
## 290 370373 7.7500              Q
## 697 363592 8.0500              S
## 892 330911 7.8292              Q
## 898 330972 7.6292              Q

# Select the passengers of data.combined whose name contains the title
# "Miss." and store them in misses.
# fixed() makes the pattern a literal string: as a regular expression the "."
# would match any character, not just a period.

library(stringr)
misses <- data.combined[
  which(str_detect(data.combined$Name, fixed("Miss."))),
]
misses[1:5, ]

##    PassengerId Survived Pclass                                 Name    Sex Age
## 3            3        1      3               Heikkinen, Miss. Laina female  26
## 11          11        1      3      Sandstrom, Miss. Marguerite Rut female   4
## 12          12        1      1             Bonnell, Miss. Elizabeth female  58
## 15          15        0      3 Vestrom, Miss. Hulda Amanda Adolfina female  14
## 23          23        1      3          McGowan, Miss. Anna "Annie" female  15
##    SibSp Parch           Ticket    Fare Cabin Embarked
## 3      0     0 STON/O2. 3101282  7.9250              S
## 11     1     1          PP 9549 16.7000    G6        S
## 12     0     0           113783 26.5500  C103        S
## 15     0     0           350406  7.8542              S
## 23     0     0           330923  8.0292              Q

# Select the passengers of data.combined whose name contains the title
# "Mrs." and store them in mrses.
# fixed() makes the pattern a literal string: as a regular expression the "."
# would match any character, not just a period.

mrses <- data.combined[
  which(str_detect(data.combined$Name, fixed("Mrs."))),
]
mrses[1:5, ]

##    PassengerId Survived Pclass
## 2            2        1      1
## 4            4        1      1
## 9            9        1      3
## 10          10        1      2
## 16          16        1      2
##                                                   Name    Sex Age SibSp Parch
## 2  Cumings, Mrs. John Bradley (Florence Briggs Thayer) female  38     1     0
## 4         Futrelle, Mrs. Jacques Heath (Lily May Peel) female  35     1     0
## 9    Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female  27     0     2
## 10                 Nasser, Mrs. Nicholas (Adele Achem) female  14     1     0
## 16                    Hewlett, Mrs. (Mary D Kingcome)  female  55     0     0
##      Ticket    Fare Cabin Embarked
## 2  PC 17599 71.2833   C85        C
## 4    113803 53.1000  C123        S
## 9    347742 11.1333              S
## 10   237736 30.0708              C
## 16   248706 16.0000              S

# Store all records of data.combined where Sex is "male" in the "males" data
# frame. The mask is built from data.combined itself; a mask built from train
# only covers the train rows, so no test-set passenger could be selected.

males <- data.combined[which(data.combined$Sex == "male"), ]
males[1:5, ]

##   PassengerId Survived Pclass                           Name  Sex Age SibSp
## 1           1        0      3        Braund, Mr. Owen Harris male  22     1
## 5           5        0      3       Allen, Mr. William Henry male  35     0
## 6           6        0      3               Moran, Mr. James male  NA     0
## 7           7        0      1        McCarthy, Mr. Timothy J male  54     0
## 8           8        0      3 Palsson, Master. Gosta Leonard male   2     3
##   Parch    Ticket    Fare Cabin Embarked
## 1     0 A/5 21171  7.2500              S
## 5     0    373450  8.0500              S
## 6     0    330877  8.4583              Q
## 7     0     17463 51.8625   E46        S
## 8     1    349909 21.0750              S

# Add a "title" variable to data.combined holding each passenger's title
# (Miss., Mrs., etc.). extractTitle() derives the title from one name; the
# values for all observations are then stored in the new variable.
#
# extractTitle(Name)
#   Name:    a passenger name such as "Braund, Mr. Owen Harris" (character or
#            factor).
#   Returns: "Miss.", "Master.", "Mrs." or "Mr." for the first of these titles
#            (checked in that order) found in Name, otherwise "Other".

extractTitle <- function(Name) {
  name <- as.character(Name)
  # fixed = TRUE matches the trailing "." as a literal period; as a regular
  # expression it would match any character (e.g. "Misso" would be "Miss.").
  for (title in c("Miss.", "Master.", "Mrs.", "Mr.")) {
    if (any(grepl(title, name, fixed = TRUE))) {
      return(title)
    }
  }
  "Other"
}

# Compute the title of every passenger in one pass. vapply() builds the
# result without growing a vector via c() on each iteration, guarantees one
# string per name, and also works when data.combined has zero rows.
titles <- vapply(
  as.character(data.combined$Name),
  extractTitle,
  character(1),
  USE.NAMES = FALSE
)
data.combined$title <- as.factor(titles)

# Survival counts per title, one panel per passenger class, using the first
# 891 rows of data.combined (the train rows, which have a known Survived).
ggplot(data.combined[1:891, ], aes(x = title, fill = Survived)) +
  geom_bar() +
  facet_wrap(~Pclass) +
  ggtitle("Pclass") +
  labs(x = "Title", y = "Total Count", fill = "Survived")