# Load the training data from CSV into a data frame, then print a compact
# summary of its columns and their types
train <- read.csv(file = "train.csv")
str(object = train)
## 'data.frame': 891 obs. of 12 variables:
## $ PassengerId: int 1 2 3 4 5 6 7 8 9 10 ...
## $ Survived : int 0 1 1 1 0 0 0 0 1 1 ...
## $ Pclass : int 3 1 3 1 3 3 1 3 3 2 ...
## $ Name : chr "Braund, Mr. Owen Harris" "Cumings, Mrs. John Bradley (Florence Briggs Thayer)" "Heikkinen, Miss. Laina" "Futrelle, Mrs. Jacques Heath (Lily May Peel)" ...
## $ Sex : chr "male" "female" "female" "female" ...
## $ Age : num 22 38 26 35 35 NA 54 2 27 14 ...
## $ SibSp : int 1 1 0 1 0 0 0 3 0 1 ...
## $ Parch : int 0 0 0 0 0 0 0 1 2 0 ...
## $ Ticket : chr "A/5 21171" "PC 17599" "STON/O2. 3101282" "113803" ...
## $ Fare : num 7.25 71.28 7.92 53.1 8.05 ...
## $ Cabin : chr "" "C85" "" "C123" ...
## $ Embarked : chr "S" "C" "S" "S" ...
# Show the first six rows
head(train, n = 6L)
## PassengerId Survived Pclass
## 1 1 0 3
## 2 2 1 1
## 3 3 1 3
## 4 4 1 1
## 5 5 0 3
## 6 6 0 3
## Name Sex Age SibSp Parch
## 1 Braund, Mr. Owen Harris male 22 1 0
## 2 Cumings, Mrs. John Bradley (Florence Briggs Thayer) female 38 1 0
## 3 Heikkinen, Miss. Laina female 26 0 0
## 4 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35 1 0
## 5 Allen, Mr. William Henry male 35 0 0
## 6 Moran, Mr. James male NA 0 0
## Ticket Fare Cabin Embarked
## 1 A/5 21171 7.2500 S
## 2 PC 17599 71.2833 C85 C
## 3 STON/O2. 3101282 7.9250 S
## 4 113803 53.1000 C123 S
## 5 373450 8.0500 S
## 6 330877 8.4583 Q
# Show the last six rows
tail(train, n = 6L)
## PassengerId Survived Pclass Name Sex
## 886 886 0 3 Rice, Mrs. William (Margaret Norton) female
## 887 887 0 2 Montvila, Rev. Juozas male
## 888 888 1 1 Graham, Miss. Margaret Edith female
## 889 889 0 3 Johnston, Miss. Catherine Helen "Carrie" female
## 890 890 1 1 Behr, Mr. Karl Howell male
## 891 891 0 3 Dooley, Mr. Patrick male
## Age SibSp Parch Ticket Fare Cabin Embarked
## 886 39 0 5 382652 29.125 Q
## 887 27 0 0 211536 13.000 S
## 888 19 0 0 112053 30.000 B42 S
## 889 NA 1 2 W./C. 6607 23.450 S
## 890 26 0 0 111369 30.000 C148 C
## 891 32 0 0 370376 7.750 Q
# Count passengers by survival outcome (0 / 1)
table(train[["Survived"]])
##
## 0 1
## 549 342
# Count passengers by sex
table(train[["Sex"]])
##
## female male
## 314 577
# Cross-tabulate survival outcome (rows) against sex (columns).
# The display labels are attached by recoding the raw values through factor(),
# so each label is bound to a specific value instead of relying on the sorted
# order of the table's levels.
survived <- table(
  factor(train$Survived, levels = c(0, 1), labels = c("Died", "Survived")),
  factor(train$Sex, levels = c("female", "male"), labels = c("Female", "Male"))
)
survived
##
## Female Male
## Died 81 468
## Survived 233 109
# One-sample test of the female survival proportion (233 survivors out of 314
# women) against the default null probability of 0.5
prop.test(x = 233, n = 314)
##
## 1-sample proportions test with continuity correction
##
## data: 233 out of 314, null probability 0.5
## X-squared = 72.615, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.6892571 0.7887777
## sample estimates:
## p
## 0.7420382
# One-sample test of the male survival proportion (109 survivors out of 577
# men) against the default null probability of 0.5
prop.test(x = 109, n = 577)
##
## 1-sample proportions test with continuity correction
##
## data: 109 out of 577, null probability 0.5
## X-squared = 222.12, df = 1, p-value < 2.2e-16
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.1582608 0.2237814
## sample estimates:
## p
## 0.1889081
# One-sample test of the overall survival proportion (342 survivors out of 891
# passengers) against the default null probability of 0.5
prop.test(x = 342, n = 891)
##
## 1-sample proportions test with continuity correction
##
## data: 342 out of 891, null probability 0.5
## X-squared = 47.627, df = 1, p-value = 5.154e-12
## alternative hypothesis: true p is not equal to 0.5
## 95 percent confidence interval:
## 0.3519194 0.4167722
## sample estimates:
## p
## 0.3838384
# Two-sample test for equality of the female and male survival proportions
# (233 of 314 women vs. 109 of 577 men). The two groups are disjoint, as the
# two-sample test requires; comparing the women with the full passenger list
# would count every woman in both samples.
prop.test(c(233, 109), c(314, 577))
##
## 2-sample test for equality of proportions with continuity correction
##
## data: c(233, 109) out of c(314, 577)
## X-squared = 260.72, df = 1, p-value < 2.2e-16
## alternative hypothesis: two.sided
## 95 percent confidence interval:
## 0.4926894 0.6135708
## sample estimates:
## prop 1 prop 2
## 0.7420382 0.1889081
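We reject the null hypothesis of equal proportions: the difference in survival rates is statistically significant. A significant test, however, shows only that the difference is unlikely to be due to chance; it cannot by itself establish that the difference was intentional.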