# Load the Titanic passenger table. read.csv() already uses a comma as its
# default field separator, so it is not spelled out here.
# NOTE(review): the str() output recorded further down shows the text columns
# as factors, i.e. it was produced with stringsAsFactors = TRUE in effect
# (the default before R 4.0) -- confirm the R version before relying on it.
df <- read.csv(file = "titanic_data.csv")
# Preview the first ten passengers.
head(df, n = 10L)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## 7 7 0 1
## 8 8 0 3
## 9 9 1 3
## 10 10 1 2
## Name Sex Age SibSp
## 1 Braund, Mr. Owen Harris male 22 1
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1
## 3 Heikkinen, Miss. Laina female 26 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1
## 5 Allen, Mr. William Henry male 35 0
## 6 Moran, Mr. James male NA 0
## 7 McCarthy, Mr. Timothy J male 54 0
## 8 Palsson, Master. Gosta Leonard male 2 3
## 9 Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg) female 27 0
## 10 Nasser, Mrs. Nicholas (Adele Achem) female 14 1
## Parch Ticket Fare Cabin Embarked
## 1 0 A/5 21171 7.2500 S
## 2 0 PC 17599 71.2833 C85 C
## 3 0 STON/O2. 3101282 7.9250 S
## 4 0 113803 53.1000 C123 S
## 5 0 373450 8.0500 S
## 6 0 330877 8.4583 Q
## 7 0 17463 51.8625 E46 S
## 8 1 349909 21.0750 S
## 9 2 347742 11.1333 S
## 10 0 237736 30.0708 C
# Report the class of the loaded object (a plain data.frame at this point).
class(df)
## [1] "data.frame"
# Compact overview of the table: dimensions, column names, column types and
# the first few values of each column.
str(df)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : Factor w/ 891 levels "Abbing, Mr. Anthony",..: 109 191 358 277 16 559 520 629 417 581 ...
## $ Sex : Factor w/ 2 levels "female","male": 2 1 1 1 2 2 2 2 1 1 ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : Factor w/ 681 levels "110152","110413",..: 524 597 670 50 473 276 86 396 345 133 ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : Factor w/ 148 levels "","A10","A14",..: 1 83 1 57 1 1 131 1 1 1 ...
## $ Embarked : Factor w/ 4 levels "","C","Q","S": 4 2 4 4 4 3 4 4 4 2 ...
# Number of unique passengers
# Count distinct passenger identifiers; a result equal to the row count means
# no PassengerId is repeated.
length(unique(df[["PassengerId"]]))
## [1] 891
# Recode the 0/1 survival flag as a labelled factor.
# The value -> label mapping is stated explicitly (0 = "Died", 1 = "Survivor")
# instead of relabelling whatever levels as.factor() happens to find, so the
# labels stay correct even if only one of the two values occurs in the data.
# Values other than 0/1 become NA. The is.factor() guard makes re-running this
# step a no-op rather than recoding an already labelled column.
if (!is.factor(df$Survived)) {
  df$Survived <- factor(
    df$Survived,
    levels = c(0L, 1L),
    labels = c("Died", "Survivor")
  )
}
library(data.table)
# Convert the data.frame to a data.table so the `[i, j, by]` syntax is
# available for the grouped summaries.
df <- as.data.table(df)
# Passenger counts for every Sex x Survived combination.
df[, .N, by = c("Sex", "Survived")]
## Sex Survived N
## 1: male Died 468
## 2: female Survivor 233
## 3: female Died 81
## 4: male Survivor 109
# Mean age per Sex x Survived group, computed only over passengers whose age
# is recorded. The summary column is left unnamed, so data.table labels it V1.
df[!is.na(Age), mean(Age), by = c("Sex", "Survived")]
## Sex Survived V1
## 1: male Died 31.61806
## 2: female Survivor 28.84772
## 3: female Died 25.04688
## 4: male Survivor 27.27602